5 Commits
lab_12 ... main

Author  SHA1        Message                                                   Date
Илья    110f79e4f5  cleaned comments                                          2025-06-17 12:27:34 +04:00
        639e381daa  Merge pull request 'lab_12' (#11) from lab_12 into main   2025-05-15 16:08:15 +04:00
                    Reviewed-on: #11
        a610d16a7f  Merge pull request 'lab_11' (#10) from lab_11 into main   2025-05-15 16:08:07 +04:00
                    Reviewed-on: #10
        061214e244  small fixes and requirements                              2025-05-03 17:56:58 +04:00
        80e6ee0e8f  lab 11 done                                               2025-05-03 17:38:40 +04:00
9 changed files with 1399 additions and 237 deletions

View File

@@ -328,7 +328,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 71,
+"execution_count": null,
 "id": "9a7b0970",
 "metadata": {},
 "outputs": [
@@ -356,7 +356,6 @@
 "import random\n",
 "\n",
 "def create_individual(elements_num): \n",
-"    # Generate a random binary string of the same length as the item list\n",
 "    return [random.randint(0, 1) for _ in range(elements_num)]\n",
 "\n",
 "def create_population(elements_num, population_size): \n",
@@ -375,7 +374,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 72,
+"execution_count": null,
 "id": "0809a0b1",
 "metadata": {},
 "outputs": [
@@ -397,7 +396,6 @@
 "        if individual[i] == 1:\n",
 "            total_value += prices[i]\n",
 "            total_weight += weights[i]\n",
-"    # If the total weight exceeds the knapsack capacity, the value is set to 0 (infeasible solution)\n",
 "    return total_value if total_weight <= capacity else 0\n",
 "\n",
 "evaluate_fitness([0, 1, 1, 1, 0], [7, 12, 19, 13, 20], [10, 11, 18, 15, 5], 50)"
@@ -413,7 +411,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 73,
+"execution_count": null,
 "id": "4d5a13d7",
 "metadata": {},
 "outputs": [
@@ -429,7 +427,6 @@
 }
 ],
 "source": [
-"# single-point crossover\n",
 "def crossover(parent1, parent2):\n",
 "    point = random.randint(1, len(parent1) - 1)\n",
 "    return (parent1[:point] + parent2[point:], parent2[:point] + parent1[point:])\n",
@@ -447,7 +444,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 74,
+"execution_count": null,
 "id": "66021b53",
 "metadata": {},
 "outputs": [
@@ -467,7 +464,6 @@
 "# Mutation 1: bit flipping\n",
 "def mutate_flip_bits(individual, mutation_rate):\n",
 "    for i in range(len(individual)):\n",
-"        # Fires with some probability\n",
 "        if random.random() < mutation_rate:\n",
 "            individual[i] = 1 - individual[i]\n",
 "\n",
@@ -497,24 +493,21 @@
 },
 {
 "cell_type": "code",
-"execution_count": 79,
+"execution_count": null,
 "id": "17093d62",
 "metadata": {},
 "outputs": [],
 "source": [
-"# Algorithm parameters\n",
 "population_size = 100\n",
 "num_generations = 10\n",
 "mutation_rate = 0.1\n",
 "mutation_strategy = 'flip'\n",
 "\n",
-"# Select crossover participants using roulette-wheel selection\n",
 "def select_parents(population, weights, prices, capacity):\n",
 "    fitness_values = [evaluate_fitness(ind, weights, prices, capacity) for ind in population]\n",
 "    total_fitness = sum(fitness_values)\n",
 "    if total_fitness == 0:\n",
 "        return random.choice(population), random.choice(population)\n",
-"    # The higher the fitness value, the better the chance of being selected\n",
 "    probabilities = [f / total_fitness for f in fitness_values]\n",
 "    return random.choices(population, weights=probabilities, k=2)\n",
 "\n",

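The hunks above touch the building blocks of a genetic algorithm for the 0/1 knapsack problem: individual creation, a fitness function, single-point crossover, bit-flip mutation and roulette-wheel parent selection. The notebook's actual generation loop is not shown in this diff; a minimal sketch of how such pieces are typically wired together, reusing the function names from the hunks above:

    import random

    def run_ga(weights, prices, capacity,
               population_size=100, num_generations=10, mutation_rate=0.1):
        # Assumes create_population, evaluate_fitness, select_parents, crossover
        # and mutate_flip_bits are defined as in the diff above; illustrative only.
        population = create_population(len(weights), population_size)
        for _ in range(num_generations):
            next_population = []
            while len(next_population) < population_size:
                parent1, parent2 = select_parents(population, weights, prices, capacity)
                child1, child2 = crossover(parent1, parent2)
                mutate_flip_bits(child1, mutation_rate)
                mutate_flip_bits(child2, mutation_rate)
                next_population.extend([child1, child2])
            population = next_population[:population_size]
        # Return the best individual of the final generation
        return max(population, key=lambda ind: evaluate_fitness(ind, weights, prices, capacity))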
lab_11/lab11.ipynb (new file, +1345 lines)

File diff suppressed because one or more lines are too long

lab_11/requirements.txt (new file, binary)

Binary file not shown.

View File

@@ -15,7 +15,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 532,
+"execution_count": null,
 "metadata": {},
 "outputs": [
 {
@@ -35,7 +35,6 @@
 "import seaborn as sns\n",
 "from sklearn.model_selection import train_test_split\n",
 "\n",
-"# print all columns\n",
 "df = pd.read_csv(\"..//..//static//csv//flavors_of_cacao.csv\")\n",
 "df.columns = df.columns.str.replace('\\n', '')\n",
 "print(df.columns)"
@@ -85,7 +84,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 533,
+"execution_count": null,
 "metadata": {},
 "outputs": [
 {
@@ -108,13 +107,10 @@
 }
 ],
 "source": [
-"# Strip the '%' sign and convert the CocoaPercent column to numeric\n",
 "df['CocoaPercent'] = df['CocoaPercent'].str.replace('%', '').astype(float)\n",
 "\n",
-"# Select the columns to analyze\n",
 "columns_to_check = ['CocoaPercent', 'Rating']\n",
 "\n",
-"# Function for counting outliers\n",
 "def count_outliers(df, columns):\n",
 "    outliers_count = {}\n",
 "    for col in columns:\n",
@@ -124,20 +120,16 @@
 "        lower_bound = Q1 - 1.5 * IQR\n",
 "        upper_bound = Q3 + 1.5 * IQR\n",
 "        \n",
-"        # Count the outliers\n",
 "        outliers = df[(df[col] < lower_bound) | (df[col] > upper_bound)]\n",
 "        outliers_count[col] = len(outliers)\n",
 "    \n",
 "    return outliers_count\n",
 "\n",
-"# Count the outliers\n",
 "outliers_count = count_outliers(df, columns_to_check)\n",
 "\n",
-"# Print the number of outliers for each column\n",
 "for col, count in outliers_count.items():\n",
 "    print(f\"Number of outliers in column '{col}': {count}\")\n",
 "\n",
-"# Draw box plots\n",
 "plt.figure(figsize=(15, 10))\n",
 "for i, col in enumerate(columns_to_check, 1):\n",
 "    plt.subplot(2, 2, i)\n",
@@ -158,7 +150,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 534,
+"execution_count": null,
 "metadata": {},
 "outputs": [
 {
@@ -180,10 +172,8 @@
 }
 ],
 "source": [
-"# Select the columns to clean\n",
 "columns_to_clean = ['CocoaPercent']\n",
 "\n",
-"# Function for removing outliers\n",
 "def remove_outliers(df, columns):\n",
 "    for col in columns:\n",
 "        Q1 = df[col].quantile(0.25)\n",
@@ -192,21 +182,15 @@
 "        lower_bound = Q1 - 1.5 * IQR\n",
 "        upper_bound = Q3 + 1.5 * IQR\n",
 "        \n",
-"        # Drop the rows that contain outliers\n",
 "        df = df[(df[col] >= lower_bound) & (df[col] <= upper_bound)]\n",
 "    \n",
 "    return df\n",
 "\n",
-"# Remove the outliers\n",
 "df_cleaned = remove_outliers(df, columns_to_clean)\n",
 "\n",
-"# Print the number of removed rows\n",
 "print(f\"Number of removed rows: {len(df) - len(df_cleaned)}\")\n",
 "\n",
-"# Draw box plots for the cleaned data\n",
 "plt.figure(figsize=(15, 6))\n",
-"\n",
-"# Box plot for CocoaPercent\n",
 "plt.subplot(1, 2, 1)\n",
 "sns.boxplot(x=df_cleaned['CocoaPercent'])\n",
 "plt.title('Box Plot of CocoaPercent (Cleaned)')\n",
@@ -215,7 +199,6 @@
 "plt.tight_layout()\n",
 "plt.show()\n",
 "\n",
-"# Save the cleaned dataset\n",
 "df_cleaned.to_csv(\"..//..//static//csv//flavors_of_cacao_cleaned.csv\", index=False)\n",
 "df = df_cleaned"
 ]
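The notebooks above all use the same 1.5*IQR fence: a value counts as an outlier when it falls below Q1 - 1.5*IQR or above Q3 + 1.5*IQR. A self-contained toy run of the rule, with invented values:

    import pandas as pd

    s = pd.Series([68, 70, 71, 72, 73, 75, 99])   # toy CocoaPercent-like values
    q1, q3 = s.quantile(0.25), s.quantile(0.75)   # 70.5 and 74.0
    iqr = q3 - q1                                 # 3.5
    mask = (s >= q1 - 1.5 * iqr) & (s <= q3 + 1.5 * iqr)
    print(s[mask].tolist())  # 99 lies above the upper fence (79.25) and is dropped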
@@ -231,7 +214,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 535,
+"execution_count": null,
 "metadata": {},
 "outputs": [
 {
@@ -266,17 +249,14 @@
 }
 ],
 "source": [
-"# Number of missing values per feature\n",
 "print(df.isnull().sum())\n",
 "\n",
 "print()\n",
 "\n",
-"# Whether any feature has missing values\n",
 "print(df.isnull().any())\n",
 "\n",
 "print()\n",
 "\n",
-"# Percentage of missing values per feature\n",
 "for i in df.columns:\n",
 "    null_rate = df[i].isnull().sum() / len(df) * 100\n",
 "    if null_rate > 0:\n",
@@ -292,7 +272,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 536,
+"execution_count": null,
 "metadata": {},
 "outputs": [
 {
@@ -315,13 +295,10 @@
 }
 ],
 "source": [
-"# Drop rows with missing values in the BeanType and Broad BeanOrigin columns\n",
 "df = df.dropna(subset=['BeanType', 'Broad BeanOrigin'])\n",
 "\n",
-"# Check for missing values after dropping\n",
 "missing_values_after_drop = df.isnull().sum()\n",
 "\n",
-"# Print the results after dropping\n",
 "print(\"\\nNumber of missing values in each column after dropping:\")\n",
 "print(missing_values_after_drop)"
 ]
@@ -337,7 +314,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 537,
+"execution_count": null,
 "metadata": {},
 "outputs": [
 {
@@ -351,18 +328,13 @@
 }
 ],
 "source": [
-"# Split into features (X) and target variable (y)\n",
-"# Assume Rating is the target variable\n",
 "X = df.drop('Rating', axis=1)\n",
 "y = df['Rating']\n",
 "\n",
-"# Split into training and remaining (validation + test) sets\n",
 "X_train, X_rem, y_train, y_rem = train_test_split(X, y, train_size=0.6, random_state=42)\n",
 "\n",
-"# Split the remainder into validation and test sets\n",
 "X_val, X_test, y_val, y_test = train_test_split(X_rem, y_rem, test_size=0.5, random_state=42)\n",
 "\n",
-"# Print the sample sizes\n",
 "print(\"Training set size:\", X_train.shape)\n",
 "print(\"Validation set size:\", X_val.shape)\n",
 "print(\"Test set size:\", X_test.shape)"
@@ -370,7 +342,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 538,
+"execution_count": null,
 "metadata": {},
 "outputs": [
 {
@@ -426,7 +398,6 @@
 }
 ],
 "source": [
-"# Function for analyzing class balance\n",
 "def analyze_balance(y_train, y_val, y_test):\n",
 "    print(\"Class distribution in the training set:\")\n",
 "    print(y_train.value_counts(normalize=True))\n",
@@ -437,7 +408,6 @@
 "    print(\"\\nClass distribution in the test set:\")\n",
 "    print(y_test.value_counts(normalize=True))\n",
 "\n",
-"# Analyze the balance\n",
 "analyze_balance(y_train, y_val, y_test)"
 ]
 },
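The two-step split above first keeps train_size=0.6 for training and then halves the remaining 40% (test_size=0.5), giving a 60/20/20 train/validation/test split. A quick check of that arithmetic on synthetic data:

    import numpy as np
    from sklearn.model_selection import train_test_split

    X, y = np.arange(100).reshape(100, 1), np.arange(100)
    X_train, X_rem, y_train, y_rem = train_test_split(X, y, train_size=0.6, random_state=42)
    X_val, X_test, y_val, y_test = train_test_split(X_rem, y_rem, test_size=0.5, random_state=42)
    print(len(X_train), len(X_val), len(X_test))  # 60 20 20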
@@ -465,7 +435,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 539,
+"execution_count": null,
 "metadata": {},
 "outputs": [
 {
@@ -485,7 +455,6 @@
 "from sklearn.model_selection import train_test_split\n",
 "from imblearn.over_sampling import SMOTE\n",
 "\n",
-"# print all columns\n",
 "df = pd.read_csv(\"..//..//static//csv//water_potability.csv\")\n",
 "print(df.columns)"
 ]
@@ -538,7 +507,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 540,
+"execution_count": null,
 "metadata": {},
 "outputs": [
 {
@@ -562,10 +531,8 @@
 }
 ],
 "source": [
-"# Select the columns to analyze\n",
 "columns_to_check = ['Hardness', 'Solids', 'Organic_carbon']\n",
 "\n",
-"# Function for counting outliers\n",
 "def count_outliers(df, columns):\n",
 "    outliers_count = {}\n",
 "    for col in columns:\n",
@@ -575,20 +542,16 @@
 "        lower_bound = Q1 - 1.5 * IQR\n",
 "        upper_bound = Q3 + 1.5 * IQR\n",
 "        \n",
-"        # Count the outliers\n",
 "        outliers = df[(df[col] < lower_bound) | (df[col] > upper_bound)]\n",
 "        outliers_count[col] = len(outliers)\n",
 "    \n",
 "    return outliers_count\n",
 "\n",
-"# Count the outliers\n",
 "outliers_count = count_outliers(df, columns_to_check)\n",
 "\n",
-"# Print the number of outliers for each column\n",
 "for col, count in outliers_count.items():\n",
 "    print(f\"Number of outliers in column '{col}': {count}\")\n",
 "\n",
-"# Draw box plots\n",
 "plt.figure(figsize=(15, 10))\n",
 "for i, col in enumerate(columns_to_check, 1):\n",
 "    plt.subplot(2, 2, i)\n",
@@ -607,7 +570,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 541,
+"execution_count": null,
 "metadata": {},
 "outputs": [
 {
@@ -629,10 +592,8 @@
 }
 ],
 "source": [
-"# Select the columns to clean\n",
 "columns_to_clean = ['Hardness', 'Solids', 'Organic_carbon']\n",
 "\n",
-"# Function for removing outliers\n",
 "def remove_outliers(df, columns):\n",
 "    for col in columns:\n",
 "        Q1 = df[col].quantile(0.25)\n",
@@ -641,33 +602,25 @@
 "        lower_bound = Q1 - 1.5 * IQR\n",
 "        upper_bound = Q3 + 1.5 * IQR\n",
 "        \n",
-"        # Drop the rows that contain outliers\n",
 "        df = df[(df[col] >= lower_bound) & (df[col] <= upper_bound)]\n",
-"    \n",
 "    return df\n",
 "\n",
-"# Remove the outliers\n",
 "df_cleaned = remove_outliers(df, columns_to_clean)\n",
 "\n",
-"# Print the number of removed rows\n",
 "print(f\"Number of removed rows: {len(df) - len(df_cleaned)}\")\n",
 "\n",
-"# Draw box plots for the cleaned data\n",
 "plt.figure(figsize=(15, 6))\n",
 "\n",
-"# Box plot for Hardness\n",
 "plt.subplot(1, 3, 1)\n",
 "sns.boxplot(x=df_cleaned['Hardness'])\n",
 "plt.title('Box Plot of Hardness (Cleaned)')\n",
 "plt.xlabel('Hardness')\n",
 "\n",
-"# Box plot for Solids\n",
 "plt.subplot(1, 3, 2)\n",
 "sns.boxplot(x=df_cleaned['Solids'])\n",
 "plt.title('Box Plot of Solids (Cleaned)')\n",
 "plt.xlabel('Solids')\n",
 "\n",
-"# Box plot for Organic_carbon\n",
 "plt.subplot(1, 3, 3)\n",
 "sns.boxplot(x=df_cleaned['Organic_carbon'])\n",
 "plt.title('Box Plot of Organic_carbon (Cleaned)')\n",
@@ -676,7 +629,6 @@
 "plt.tight_layout()\n",
 "plt.show()\n",
 "\n",
-"# Save the cleaned dataset\n",
 "df_cleaned.to_csv(\"..//..//static//csv//water_potability_cleaned.csv\", index=False)\n",
 "df = df_cleaned"
 ]
@@ -692,7 +644,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 542,
+"execution_count": null,
 "metadata": {},
 "outputs": [
 {
@@ -730,17 +682,14 @@
 }
 ],
 "source": [
-"# Number of missing values per feature\n",
 "print(df.isnull().sum())\n",
 "\n",
 "print()\n",
 "\n",
-"# Whether any feature has missing values\n",
 "print(df.isnull().any())\n",
 "\n",
 "print()\n",
 "\n",
-"# Percentage of missing values per feature\n",
 "for i in df.columns:\n",
 "    null_rate = df[i].isnull().sum() / len(df) * 100\n",
 "    if null_rate > 0:\n",
@@ -756,7 +705,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 543,
+"execution_count": null,
 "metadata": {},
 "outputs": [
 {
@@ -780,15 +729,12 @@
 }
 ],
 "source": [
-"# Replace missing values\n",
 "df[\"ph\"] = df[\"ph\"].fillna(df[\"ph\"].median())\n",
 "df[\"Sulfate\"] = df[\"Sulfate\"].fillna(df[\"Sulfate\"].median())\n",
 "df[\"Trihalomethanes\"] = df[\"Trihalomethanes\"].fillna(df[\"Trihalomethanes\"].median())\n",
 "\n",
-"# Check for missing values after replacement\n",
 "missing_values_after_drop = df.isnull().sum()\n",
 "\n",
-"# Print the results after replacement\n",
 "print(\"\\nNumber of missing values in each column after replacement:\")\n",
 "print(missing_values_after_drop)"
 ]
@@ -804,7 +750,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 544,
+"execution_count": null,
 "metadata": {},
 "outputs": [
 {
@@ -818,18 +764,13 @@
 }
 ],
 "source": [
-"# Split into features (X) and target variable (y)\n",
-"# Assume Potability is the target variable\n",
 "X = df.drop('Potability', axis=1)\n",
 "y = df['Potability']\n",
 "\n",
-"# Split into training and remaining (validation + test) sets\n",
 "X_train, X_rem, y_train, y_rem = train_test_split(X, y, train_size=0.6, random_state=42)\n",
 "\n",
-"# Split the remainder into validation and test sets\n",
 "X_val, X_test, y_val, y_test = train_test_split(X_rem, y_rem, test_size=0.5, random_state=42)\n",
 "\n",
-"# Print the sample sizes\n",
 "print(\"Training set size:\", X_train.shape)\n",
 "print(\"Validation set size:\", X_val.shape)\n",
 "print(\"Test set size:\", X_test.shape)"
@@ -837,7 +778,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 545,
+"execution_count": null,
 "metadata": {},
 "outputs": [
 {
@@ -865,7 +806,6 @@
 }
 ],
 "source": [
-"# Function for analyzing class balance\n",
 "def analyze_balance(y_train, y_val, y_test):\n",
 "    print(\"Class distribution in the training set:\")\n",
 "    print(y_train.value_counts(normalize=True))\n",
@@ -876,7 +816,6 @@
 "    print(\"\\nClass distribution in the test set:\")\n",
 "    print(y_test.value_counts(normalize=True))\n",
 "\n",
-"# Analyze the balance\n",
 "analyze_balance(y_train, y_val, y_test)"
 ]
 },
@@ -889,7 +828,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 546,
+"execution_count": null,
 "metadata": {},
 "outputs": [
 {
@@ -907,10 +846,8 @@
 "source": [
 "smote = SMOTE(random_state=42)\n",
 "\n",
-"# Apply SMOTE to balance the training set\n",
 "X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)\n",
 "\n",
-"# Check the balance after SMOTE\n",
 "print(\"Training set balance after SMOTE:\")\n",
 "print(y_train_resampled.value_counts(normalize=True))"
 ]
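Note that SMOTE is fitted on the training split only, which is the usual practice: it synthesizes new minority-class samples by interpolating between minority-class neighbors, and resampling the validation or test sets would leak synthetic data into the evaluation. A toy run on deliberately imbalanced synthetic data (the exact class counts are approximate):

    from collections import Counter
    from imblearn.over_sampling import SMOTE
    from sklearn.datasets import make_classification

    X, y = make_classification(n_samples=200, weights=[0.9, 0.1], random_state=42)
    X_res, y_res = SMOTE(random_state=42).fit_resample(X, y)
    print(Counter(y))      # roughly {0: 180, 1: 20}
    print(Counter(y_res))  # minority class oversampled to match the majority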
@@ -938,7 +875,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 547,
+"execution_count": null,
 "metadata": {},
 "outputs": [
 {
@@ -958,7 +895,6 @@
 "from sklearn.model_selection import train_test_split\n",
 "from imblearn.under_sampling import RandomUnderSampler\n",
 "\n",
-"# print all columns\n",
 "df = pd.read_csv(\"..//..//static//csv//diabetes.csv\")\n",
 "print(df.columns)"
 ]
@@ -1009,7 +945,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 548,
+"execution_count": null,
 "metadata": {},
 "outputs": [
 {
@@ -1033,10 +969,8 @@
 }
 ],
 "source": [
-"# Select the columns to analyze\n",
 "columns_to_check = ['Age', 'BloodPressure', 'BMI']\n",
 "\n",
-"# Function for counting outliers\n",
 "def count_outliers(df, columns):\n",
 "    outliers_count = {}\n",
 "    for col in columns:\n",
@@ -1046,20 +980,16 @@
 "        lower_bound = Q1 - 1.5 * IQR\n",
 "        upper_bound = Q3 + 1.5 * IQR\n",
 "        \n",
-"        # Count the outliers\n",
 "        outliers = df[(df[col] < lower_bound) | (df[col] > upper_bound)]\n",
 "        outliers_count[col] = len(outliers)\n",
 "    \n",
 "    return outliers_count\n",
 "\n",
-"# Count the outliers\n",
 "outliers_count = count_outliers(df, columns_to_check)\n",
 "\n",
-"# Print the number of outliers for each column\n",
 "for col, count in outliers_count.items():\n",
 "    print(f\"Number of outliers in column '{col}': {count}\")\n",
 "\n",
-"# Draw box plots\n",
 "plt.figure(figsize=(15, 10))\n",
 "for i, col in enumerate(columns_to_check, 1):\n",
 "    plt.subplot(2, 2, i)\n",
@@ -1078,7 +1008,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 549,
+"execution_count": null,
 "metadata": {},
 "outputs": [
 {
@@ -1100,10 +1030,8 @@
 }
 ],
 "source": [
-"# Select the columns to clean\n",
 "columns_to_clean = ['Age', 'BloodPressure', 'BMI']\n",
 "\n",
-"# Function for removing outliers\n",
 "def remove_outliers(df, columns):\n",
 "    for col in columns:\n",
 "        Q1 = df[col].quantile(0.25)\n",
@@ -1112,33 +1040,26 @@
 "        lower_bound = Q1 - 1.5 * IQR\n",
 "        upper_bound = Q3 + 1.5 * IQR\n",
 "        \n",
-"        # Drop the rows that contain outliers\n",
 "        df = df[(df[col] >= lower_bound) & (df[col] <= upper_bound)]\n",
 "    \n",
 "    return df\n",
 "\n",
-"# Remove the outliers\n",
 "df_cleaned = remove_outliers(df, columns_to_clean)\n",
 "\n",
-"# Print the number of removed rows\n",
 "print(f\"Number of removed rows: {len(df) - len(df_cleaned)}\")\n",
 "\n",
-"# Draw box plots for the cleaned data\n",
 "plt.figure(figsize=(15, 6))\n",
 "\n",
-"# Box plot for Age\n",
 "plt.subplot(1, 3, 1)\n",
 "sns.boxplot(x=df_cleaned['Age'])\n",
 "plt.title('Box Plot of Age (Cleaned)')\n",
 "plt.xlabel('Age')\n",
 "\n",
-"# Box plot for BloodPressure\n",
 "plt.subplot(1, 3, 2)\n",
 "sns.boxplot(x=df_cleaned['BloodPressure'])\n",
 "plt.title('Box Plot of BloodPressure (Cleaned)')\n",
 "plt.xlabel('BloodPressure')\n",
 "\n",
-"# Box plot for BMI\n",
 "plt.subplot(1, 3, 3)\n",
 "sns.boxplot(x=df_cleaned['BMI'])\n",
 "plt.title('Box Plot of BMI (Cleaned)')\n",
@@ -1147,7 +1068,6 @@
 "plt.tight_layout()\n",
 "plt.show()\n",
 "\n",
-"# Save the cleaned dataset\n",
 "df_cleaned.to_csv(\"..//..//static//csv//diabetes_cleaned.csv\", index=False)\n",
 "df = df_cleaned"
 ]
@@ -1163,7 +1083,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 550,
+"execution_count": null,
 "metadata": {},
 "outputs": [
 {
@@ -1196,17 +1116,14 @@
 }
 ],
 "source": [
-"# Number of missing values per feature\n",
 "print(df.isnull().sum())\n",
 "\n",
 "print()\n",
 "\n",
-"# Whether any feature has missing values\n",
 "print(df.isnull().any())\n",
 "\n",
 "print()\n",
 "\n",
-"# Percentage of missing values per feature\n",
 "for i in df.columns:\n",
 "    null_rate = df[i].isnull().sum() / len(df) * 100\n",
 "    if null_rate > 0:\n",
@@ -1224,7 +1141,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 551,
+"execution_count": null,
 "metadata": {},
 "outputs": [
 {
@@ -1238,18 +1155,13 @@
 }
 ],
 "source": [
-"# Split into features (X) and target variable (y)\n",
-"# Assume Outcome is the target variable\n",
 "X = df.drop('Outcome', axis=1)\n",
 "y = df['Outcome']\n",
 "\n",
-"# Split into training and remaining (validation + test) sets\n",
 "X_train, X_rem, y_train, y_rem = train_test_split(X, y, train_size=0.6, random_state=42)\n",
 "\n",
-"# Split the remainder into validation and test sets\n",
 "X_val, X_test, y_val, y_test = train_test_split(X_rem, y_rem, test_size=0.5, random_state=42)\n",
 "\n",
-"# Print the sample sizes\n",
 "print(\"Training set size:\", X_train.shape)\n",
 "print(\"Validation set size:\", X_val.shape)\n",
 "print(\"Test set size:\", X_test.shape)"
@@ -1257,7 +1169,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 552,
+"execution_count": null,
 "metadata": {},
 "outputs": [
 {
@@ -1285,7 +1197,6 @@
 }
 ],
 "source": [
-"# Function for analyzing class balance\n",
 "def analyze_balance(y_train, y_val, y_test):\n",
 "    print(\"Class distribution in the training set:\")\n",
 "    print(y_train.value_counts(normalize=True))\n",
@@ -1296,7 +1207,6 @@
 "    print(\"\\nClass distribution in the test set:\")\n",
 "    print(y_test.value_counts(normalize=True))\n",
 "\n",
-"# Analyze the balance\n",
 "analyze_balance(y_train, y_val, y_test)"
 ]
 },

View File

@@ -204,7 +204,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 137,
+"execution_count": null,
 "metadata": {},
 "outputs": [
 {
@@ -244,17 +244,14 @@
 }
 ],
 "source": [
-"# Number of missing values per feature\n",
 "print(df.isnull().sum())\n",
 "\n",
 "print()\n",
 "\n",
-"# Whether any feature has missing values\n",
 "print(df.isnull().any())\n",
 "\n",
 "print()\n",
 "\n",
-"# Percentage of missing values per feature\n",
 "for i in df.columns:\n",
 "    null_rate = df[i].isnull().sum() / len(df) * 100\n",
 "    if null_rate > 0:\n",
@@ -270,7 +267,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 138,
+"execution_count": null,
 "metadata": {},
 "outputs": [
 {
@@ -296,13 +293,10 @@
 }
 ],
 "source": [
-"# Replace missing values\n",
 "df[\"bmi\"] = df[\"bmi\"].fillna(df[\"bmi\"].median())\n",
 "\n",
-"# Check for missing values after replacement\n",
 "missing_values_after_drop = df.isnull().sum()\n",
 "\n",
-"# Print the results after replacement\n",
 "print(\"\\nNumber of missing values in each column after replacement:\")\n",
 "print(missing_values_after_drop)"
 ]
@@ -344,7 +338,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 140,
+"execution_count": null,
 "metadata": {},
 "outputs": [
 {
@@ -358,19 +352,13 @@
 }
 ],
 "source": [
-"# Split the data into features (X) and target variable (y)\n",
-"# In this case we want to predict 'stroke'\n",
 "X = df.drop(columns=['stroke'])\n",
 "y = df['stroke']\n",
 "\n",
-"# Split the data into training and test sets\n",
-"# First split into training and test\n",
 "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)\n",
 "\n",
-"# Then split the training set into training and validation\n",
 "X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.3)\n",
 "\n",
-"# Check the sample sizes\n",
 "print(\"Training set size:\", X_train.shape)\n",
 "print(\"Validation set size:\", X_val.shape)\n",
 "print(\"Test set size:\", X_test.shape)"
@@ -385,7 +373,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 141,
+"execution_count": null,
 "metadata": {},
 "outputs": [
 {
@@ -423,9 +411,7 @@
 }
 ],
 "source": [
-"# Function for analyzing class balance\n",
 "def analyze_balance(y_train, y_val, y_test, y_name):\n",
-"    # Class distributions\n",
 "    print(\"Class distribution in the training set:\")\n",
 "    print(y_train.value_counts(normalize=True))\n",
 "    \n",
@@ -435,22 +421,18 @@
 "    print(\"\\nClass distribution in the test set:\")\n",
 "    print(y_test.value_counts(normalize=True))\n",
 "\n",
-"    # Create a figure and axes for three bar charts\n",
 "    fig, axes = plt.subplots(1, 3, figsize=(18, 5), sharey=True)\n",
 "    fig.suptitle('Distribution across the samples')\n",
 "\n",
-"    # Training set\n",
 "    sns.barplot(x=y_train.value_counts().index, y=y_train.value_counts(normalize=True), ax=axes[0])\n",
 "    axes[0].set_title('Training set')\n",
 "    axes[0].set_xlabel(y_name)\n",
 "    axes[0].set_ylabel('Share')\n",
 "\n",
-"    # Validation set\n",
 "    sns.barplot(x=y_val.value_counts().index, y=y_val.value_counts(normalize=True), ax=axes[1])\n",
 "    axes[1].set_title('Validation set')\n",
 "    axes[1].set_xlabel(y_name)\n",
 "\n",
-"    # Test set\n",
 "    sns.barplot(x=y_test.value_counts().index, y=y_test.value_counts(normalize=True), ax=axes[2])\n",
 "    axes[2].set_title('Test set')\n",
 "    axes[2].set_xlabel(y_name)\n",
@@ -469,7 +451,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 142,
+"execution_count": null,
 "metadata": {},
 "outputs": [
 {
@@ -509,11 +491,9 @@
 "source": [
 "ros = RandomOverSampler(random_state=42)\n",
 "\n",
-"# Apply RandomOverSampler to balance the samples\n",
 "X_train_resampled, y_train_resampled = ros.fit_resample(X_train, y_train)\n",
 "X_val_resampled, y_val_resampled = ros.fit_resample(X_val, y_val)\n",
 "\n",
-"# Check the balance after RandomOverSampler\n",
 "analyze_balance(y_train_resampled, y_val_resampled, y_test, 'stroke')"
 ]
 },
@@ -530,7 +510,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 143,
+"execution_count": null,
 "metadata": {},
 "outputs": [
 {
@@ -575,16 +555,12 @@
 }
 ],
 "source": [
-"# Define the categorical features\n",
 "categorical_features = ['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status']\n",
 "\n",
-"# Apply one-hot encoding to the training set\n",
 "X_train_encoded = pd.get_dummies(X_train_resampled, columns=categorical_features, drop_first=True)\n",
 "\n",
-"# Apply one-hot encoding to the validation set\n",
 "X_val_encoded = pd.get_dummies(X_val_resampled, columns=categorical_features, drop_first=True)\n",
 "\n",
-"# Apply one-hot encoding to the test set\n",
 "X_test_encoded = pd.get_dummies(X_test, columns=categorical_features, drop_first=True)\n",
 "\n",
 "print(X_train_encoded.head())"
@@ -599,7 +575,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 144,
+"execution_count": null,
 "metadata": {},
 "outputs": [
 {
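The next hunk strips the comments around the age discretization. The pd.cut call it keeps assigns each value to a right-closed bin; a small check of the bins used below:

    import pandas as pd

    ages = pd.Series([8, 25, 30, 67])
    bins, labels = [0, 25, 55, 100], ["young", "middle-aged", "old"]
    print(pd.cut(ages, bins=bins, labels=labels).tolist())
    # ['young', 'young', 'middle-aged', 'old'] -- bins are right-closed, so 25 still maps to 'young'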
@@ -644,21 +620,17 @@
 }
 ],
 "source": [
-"# Numeric features to discretize\n",
 "numerical_features = ['age']\n",
 "\n",
-"# Function for discretizing numeric features\n",
 "def discretize_features(df, features, bins, labels):\n",
 "    for feature in features:\n",
 "        df[f'{feature}_bin'] = pd.cut(df[feature], bins=bins, labels=labels)\n",
 "        df.drop(columns=[feature], inplace=True)\n",
 "    return df\n",
 "\n",
-"# Chosen bin edges and labels\n",
 "age_bins = [0, 25, 55, 100]\n",
 "age_labels = [\"young\", \"middle-aged\", \"old\"]\n",
 "\n",
-"# Apply the discretization to the training, validation and test sets\n",
 "X_train_encoded = discretize_features(X_train_encoded, numerical_features, bins=age_bins, labels=age_labels)\n",
 "X_val_encoded = discretize_features(X_val_encoded, numerical_features, bins=age_bins, labels=age_labels)\n",
 "X_test_encoded = discretize_features(X_test_encoded, numerical_features, bins=age_bins, labels=age_labels)\n",
@@ -741,7 +713,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 146,
+"execution_count": null,
 "metadata": {},
 "outputs": [
 {
@@ -786,7 +758,6 @@
 }
 ],
 "source": [
-"# Example of scaling the numeric features\n",
 "numerical_features = ['avg_glucose_level', 'bmi', 'glucose_age_deviation']\n",
 "\n",
 "scaler = StandardScaler()\n",
@@ -806,7 +777,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 147,
+"execution_count": null,
 "metadata": {},
 "outputs": [
 {
@@ -872,7 +843,7 @@
 }
 ],
 "source": [
-"data = X_train_encoded.copy()  # Use the preprocessed data\n",
+"data = X_train_encoded.copy()\n",
 "\n",
 "es = ft.EntitySet(id=\"patients\")\n",
 "\n",
@@ -918,7 +889,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 148,
+"execution_count": null,
 "metadata": {},
 "outputs": [
 {
@@ -930,23 +901,15 @@
 }
 ],
 "source": [
-"X_train_encoded = pd.get_dummies(X_train_encoded, drop_first=True)\n",
-"X_val_encoded = pd.get_dummies(X_val_encoded, drop_first=True)\n",
-"X_test_encoded = pd.get_dummies(X_test_encoded, drop_first=True)\n",
-"\n",
 "all_columns = X_train_encoded.columns\n",
 "X_train_encoded = X_train_encoded.reindex(columns=all_columns, fill_value=0)\n",
 "X_val_encoded = X_val_encoded.reindex(columns=all_columns, fill_value=0)\n",
 "X_test_encoded = X_test_encoded.reindex(columns=all_columns, fill_value=0)\n",
 "\n",
-"# Model selection\n",
 "model = RandomForestClassifier(n_estimators=100, random_state=42)\n",
 "\n",
-"# Start timing\n",
 "start_time = time.time()\n",
 "model.fit(X_train_encoded, y_train_resampled)\n",
-"\n",
-"# Model training time\n",
 "train_time = time.time() - start_time\n",
 "\n",
 "print(f'Model training time: {train_time:.2f} seconds')"
@@ -954,7 +917,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 149,
+"execution_count": null,
 "metadata": {},
 "outputs": [
 {
@@ -985,11 +948,9 @@
 }
 ],
 "source": [
-"# Get the feature importances\n",
 "importances = model.feature_importances_\n",
 "feature_names = X_train_encoded.columns\n",
 "\n",
-"# Sort the features by importance\n",
 "feature_importance = pd.DataFrame({'feature': feature_names, 'importance': importances})\n",
 "feature_importance = feature_importance.sort_values(by='importance', ascending=False)\n",
 "\n",
@@ -999,7 +960,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 150,
+"execution_count": null,
 "metadata": {},
 "outputs": [
 {
@@ -1037,7 +998,6 @@
 }
 ],
 "source": [
-"# Prediction and evaluation\n",
 "y_pred = model.predict(X_test_encoded)\n",
 "\n",
 "accuracy = accuracy_score(y_test, y_pred)\n",
@@ -1052,12 +1012,10 @@
 "print(f\"F1 Score: {f1}\")\n",
 "print(f\"ROC AUC: {roc_auc}\")\n",
 "\n",
-"# Cross-validation\n",
 "scores = cross_val_score(model, X_train_encoded, y_train_resampled, cv=5, scoring='accuracy')\n",
 "accuracy_cv = scores.mean()\n",
 "print(f\"Cross-validated Accuracy: {accuracy_cv}\")\n",
 "\n",
-"# Feature importance analysis\n",
 "feature_importances = model.feature_importances_\n",
 "feature_names = X_train_encoded.columns\n",
 "\n",
@@ -1069,7 +1027,6 @@
 "plt.title('Feature Importance')\n",
 "plt.show()\n",
 "\n",
-"# Check for overfitting\n",
 "y_train_pred = model.predict(X_train_encoded)\n",
 "\n",
 "accuracy_train = accuracy_score(y_train_resampled, y_train_pred)\n",

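A note on the get_dummies/reindex lines above: encoding each split separately can produce different column sets when a category is missing from one split, so the validation and test frames are reindexed to the training columns, with absent dummies filled with 0. A minimal sketch of the failure mode and the fix, using toy values for the notebook's work_type column:

    import pandas as pd

    train = pd.DataFrame({"work_type": ["Govt_job", "Private", "Self-employed"]})
    test = pd.DataFrame({"work_type": ["Private", "Never_worked"]})   # category unseen in train

    train_enc = pd.get_dummies(train, drop_first=True)                # Private, Self-employed dummies
    test_enc = pd.get_dummies(test, drop_first=True)                  # different columns!
    test_enc = test_enc.reindex(columns=train_enc.columns, fill_value=0)
    print(test_enc.columns.tolist() == train_enc.columns.tolist())    # True after reindexing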
View File

@@ -1153,7 +1153,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 86,
+"execution_count": null,
 "metadata": {},
 "outputs": [
 {
@@ -1171,13 +1171,10 @@
 "import numpy as np\n",
 "from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score\n",
 "\n",
-"# Get the unique target classes from the training data\n",
 "unique_classes = np.unique(y_train)\n",
 "\n",
-"# Generate random predictions by sampling from the target's range of values\n",
 "random_predictions = np.random.choice(unique_classes, size=len(y_test))\n",
 "\n",
-"# Compute the baseline metrics\n",
 "baseline_accuracy = accuracy_score(y_test, random_predictions)\n",
 "baseline_precision = precision_score(y_test, random_predictions)\n",
 "baseline_recall = recall_score(y_test, random_predictions)\n",
@@ -1624,7 +1621,6 @@
 "from sklearn.model_selection import GridSearchCV\n",
 "from sklearn import neighbors, ensemble, neural_network\n",
 "\n",
-"# Dictionary of hyperparameter options for each model\n",
 "param_grids = {\n",
 "    \"knn\": {\n",
 "        \"n_neighbors\": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30], \n",
@@ -1648,22 +1644,17 @@
 "    }\n",
 "}\n",
 "\n",
-"# Create the model instances\n",
 "models = {\n",
 "    \"knn\": neighbors.KNeighborsClassifier(),\n",
 "    \"random_forest\": ensemble.RandomForestClassifier(),\n",
 "    \"mlp\": neural_network.MLPClassifier()\n",
 "}\n",
 "\n",
-"# Dictionary for storing the models with their best parameters\n",
 "class_models = {}\n",
 "\n",
-"# Run the grid search for each model\n",
 "for model_name, model in models.items():\n",
-"    # Create a GridSearchCV for the current model\n",
 "    gs_optimizer = GridSearchCV(estimator=model, param_grid=param_grids[model_name], scoring=\"f1\", n_jobs=-1)\n",
 "    \n",
-"    # Fit the GridSearchCV\n",
 "    gs_optimizer.fit(preprocessed_df, y_train.values.ravel())\n",
 "    \n",
 "    # Get the best parameters\n",
@@ -1671,7 +1662,7 @@
 "    print(f\"Best parameters for {model_name}: {best_params}\")\n",
 "    \n",
 "    class_models[model_name] = {\n",
-"        \"model\": model.set_params(**best_params)  # Configure the model with the best parameters\n",
+"        \"model\": model.set_params(**best_params) \n",
 "    }"
 ]
 },
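The grid search above tunes each model with F1 as the selection metric. A compact, self-contained version of the same pattern on synthetic data, reduced to one model and one grid for brevity:

    from sklearn.datasets import make_classification
    from sklearn.model_selection import GridSearchCV
    from sklearn.neighbors import KNeighborsClassifier

    X, y = make_classification(n_samples=300, random_state=42)
    gs = GridSearchCV(KNeighborsClassifier(),
                      param_grid={"n_neighbors": list(range(1, 31))},
                      scoring="f1", n_jobs=-1)
    gs.fit(X, y)
    print(gs.best_params_)          # the winning grid entry
    best_knn = gs.best_estimator_   # refit on the full data with the best parameters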
@@ -2586,7 +2577,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 94,
+"execution_count": null,
 "metadata": {},
 "outputs": [
 {
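The hunk below keeps a regression baseline that predicts the training mean for every test row; any trained model should beat its RMSE. The arithmetic on a toy pair of lists:

    import math
    from sklearn.metrics import mean_squared_error

    y_train = [3.0, 3.5, 4.0]
    y_test = [3.0, 4.0]
    baseline_predictions = [sum(y_train) / len(y_train)] * len(y_test)  # predict 3.5 everywhere
    print(math.sqrt(mean_squared_error(y_test, baseline_predictions)))  # 0.5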
@@ -2603,10 +2594,8 @@
 "import math\n",
 "from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score\n",
 "\n",
-"# Baseline prediction: the mean of y_train\n",
 "baseline_predictions = [y_train.mean()] * len(y_test)\n",
 "\n",
-"# Compute the baseline quality metrics\n",
 "baseline_rmse = math.sqrt(\n",
 "    mean_squared_error(y_test, baseline_predictions)\n",
 "    )\n",
@@ -3111,7 +3100,6 @@
 }
 ],
 "source": [
-"# Dictionary of hyperparameter options for each model\n",
 "param_grids = {\n",
 "    \"knn\": {\n",
 "        \"n_neighbors\": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30], \n",
@@ -3136,30 +3124,24 @@
 "    }\n",
 "}\n",
 "\n",
-"# Create the model instances\n",
 "models = {\n",
 "    \"knn\": neighbors.KNeighborsRegressor(),\n",
 "    \"random_forest\": ensemble.RandomForestRegressor(),\n",
 "    \"mlp\": neural_network.MLPRegressor()\n",
 "}\n",
 "\n",
-"# Dictionary for storing the models with their best parameters\n",
 "class_models = {}\n",
 "\n",
-"# Run the grid search for each model\n",
 "for model_name, model in models.items():\n",
-"    # Create a GridSearchCV for the current model\n",
 "    gs_optimizer = GridSearchCV(estimator=model, param_grid=param_grids[model_name], scoring='neg_mean_squared_error', n_jobs=-1)\n",
 "    \n",
-"    # Fit the GridSearchCV\n",
 "    gs_optimizer.fit(preprocessed_df, y_train.values.ravel())\n",
 "    \n",
-"    # Get the best parameters\n",
 "    best_params = gs_optimizer.best_params_\n",
 "    print(f\"Best parameters for {model_name}: {best_params}\")\n",
 "    \n",
 "    class_models[model_name] = {\n",
-"        \"model\": model.set_params(**best_params)  # Configure the model with the best parameters\n",
+"        \"model\": model.set_params(**best_params)\n",
 "    }"
 ]
 },
@@ -3323,7 +3305,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 100,
+"execution_count": null,
 "metadata": {},
 "outputs": [
 {
@@ -3379,7 +3361,6 @@
 }
 ],
 "source": [
-"# Draw plots for all models\n",
 "for model_name, model_data in class_models.items():\n",
 "    print(f\"Model: {model_name}\")\n",
 "    y_pred = model_data[\"preds\"]\n",

View File

@@ -308,7 +308,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 353,
+"execution_count": null,
 "metadata": {},
 "outputs": [
 {
@@ -348,17 +348,14 @@
 }
 ],
 "source": [
-"# Number of missing values per feature\n",
 "print(df.isnull().sum())\n",
 "\n",
 "print()\n",
 "\n",
-"# Whether any feature has missing values\n",
 "print(df.isnull().any())\n",
 "\n",
 "print()\n",
 "\n",
-"# Percentage of missing values per feature\n",
 "for i in df.columns:\n",
 "    null_rate = df[i].isnull().sum() / len(df) * 100\n",
 "    if null_rate > 0:\n",
@@ -367,11 +364,10 @@
 },
 {
 "cell_type": "code",
-"execution_count": 354,
+"execution_count": null,
 "metadata": {},
 "outputs": [],
 "source": [
-"# Replace missing values\n",
 "df[\"bmi\"] = df[\"bmi\"].fillna(df[\"bmi\"].median())"
 ]
 },

View File

@@ -114,7 +114,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 3,
+"execution_count": null,
 "metadata": {},
 "outputs": [],
 "source": [
@@ -122,15 +122,9 @@
 "import emoji\n",
 "from num2words import num2words\n",
 "\n",
-"# Function for converting emojis to words\n",
 "def emojis_words(text):\n",
-"    \n",
-"    # emoji module: convert emojis into their textual descriptions\n",
 "    text = emoji.demojize(text, delimiters=(\" \", \" \"))\n",
-"    \n",
-"    # Clean the text by removing \":\" and \"_\" and adding a space between the individual words\n",
 "    text = text.replace(\":\", \"\").replace(\"_\", \" \")\n",
-"    \n",
 "    return text\n",
 "\n",
 "def transform_text(text):\n",
@@ -140,22 +134,17 @@
 "    # Remove all URLs and links from the text\n",
 "    text = re.sub(r'http\\S+', '', text)\n",
 "\n",
-"    # Convert emojis to text\n",
 "    text = emojis_words(text)\n",
 "\n",
-"    # Convert to lower case\n",
 "    text = text.lower()\n",
 "\n",
-"    # Remove extra whitespace\n",
 "    text = re.sub(r'\\s+', ' ', text) \n",
 "    \n",
-"    # Replace \"ё\" with \"е\"\n",
 "    text = text.replace(\"ё\", \"е\")\n",
 "\n",
 "    # Remove all special characters\n",
 "    text = re.sub(r'[^a-zA-Zа-яА-Я0-9\\s]', '', text)\n",
 "\n",
-"    # Convert numbers to words\n",
 "    words: list[str] = text.split()\n",
 "    words = [num2words(word, lang=\"ru\") if word.isdigit() else word for word in words]\n",
 "    text = \" \".join(words)\n",
@@ -177,7 +166,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 4,
+"execution_count": null,
 "metadata": {},
 "outputs": [
 {
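For the transform_text cell above: emoji.demojize replaces each emoji with its textual alias, and num2words spells out digit-only tokens in Russian. A toy run (the exact alias text depends on the installed emoji version):

    import emoji
    from num2words import num2words

    text = emoji.demojize("круто 👍 10 из 10", delimiters=(" ", " "))
    text = text.replace(":", "").replace("_", " ").lower()
    words = [num2words(w, lang="ru") if w.isdigit() else w for w in text.split()]
    print(" ".join(words))  # e.g. "круто thumbs up десять из десять"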
@@ -197,16 +186,15 @@
 "    doc = sp(text)\n",
 "    \n",
 "    filtered_tokens = [\n",
-"        f\"{token.lemma_}_{token.pos_}_{token.morph}\"  # Build the string in the required format\n",
+"        f\"{token.lemma_}_{token.pos_}_{token.morph}\"\n",
 "        for token in doc\n",
-"        if token.text not in stop_words and len(token.text) <= 20  # Filtering \n",
+"        if token.text not in stop_words and len(token.text) <= 20 \n",
 "    ]\n",
 "    \n",
 "    return \" \".join(filtered_tokens)\n",
 "\n",
 "df[\"preprocessed_text\"] = df[\"preprocessed_text\"].apply(preprocess_text)\n",
 "\n",
-"# Print 10 tokens from the first text\n",
 "first_text_tokens = df[\"preprocessed_text\"].iloc[0].split()[:10]\n",
 "print(\" \".join(first_text_tokens))"
 ]
@@ -220,7 +208,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 5,
+"execution_count": null,
 "metadata": {},
 "outputs": [
 {
@@ -299,10 +287,8 @@
 "    n_grams: list[tuple] = list(ngrams(tokens, n))\n",
 "    return n_grams\n",
 "\n",
-"# Example for bigrams (N=2)\n",
 "df[\"bigrams\"] = df[\"preprocessed_text\"].apply(lambda x: generate_ngrams(x, n=2))\n",
 "\n",
-"# Example for trigrams (N=3)\n",
 "df[\"trigrams\"] = df[\"preprocessed_text\"].apply(lambda x: generate_ngrams(x, n=3))\n",
 "\n",
 "print(df.iloc[15:25])"

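The generate_ngrams helper above appears to wrap nltk.util.ngrams, which slides a window of length n over the token list:

    from nltk.util import ngrams

    tokens = "мама мыла раму".split()
    print(list(ngrams(tokens, 2)))  # [('мама', 'мыла'), ('мыла', 'раму')]
    print(list(ngrams(tokens, 3)))  # [('мама', 'мыла', 'раму')]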
View File

@@ -98,7 +98,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 8,
+"execution_count": null,
 "id": "5b915c12",
 "metadata": {},
 "outputs": [
@@ -117,11 +117,8 @@
 "def preprocess_images(images):\n",
 "    processed_images = []\n",
 "    for img in images:\n",
-"        # Resize\n",
 "        img_resized = cv2.resize(img, (128, 128))\n",
-"        # Convert to grayscale\n",
 "        img_gray = cv2.cvtColor(img_resized, cv2.COLOR_BGR2GRAY)\n",
-"        # Increase contrast via histogram equalization\n",
 "        img_eq = cv2.equalizeHist(img_gray)\n",
 "        processed_images.append(img_eq)\n",
 "    return np.array(processed_images)\n",
@@ -156,7 +153,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 9,
+"execution_count": null,
 "id": "7cc2f6b2",
 "metadata": {},
 "outputs": [
@@ -173,12 +170,9 @@
 ],
 "source": [
 "def apply_filters(img):\n",
-"    # Denoise\n",
 "    img_blur = cv2.GaussianBlur(img, (5, 5), 0)\n",
-"    # Sharpen\n",
 "    kernel = np.array([[0, -1, 0], [-1, 5, -1], [0, -1, 0]])\n",
 "    img_sharp = cv2.filter2D(img_blur, -1, kernel)\n",
-"    # Detect edges\n",
 "    img_edges = cv2.Canny(img_sharp, 100, 200)\n",
 "    return img_edges\n",
 "\n",