From 82e5462ffd142f0f4f11a00138bcb05881216573 Mon Sep 17 00:00:00 2001 From: GokaPek Date: Fri, 21 Feb 2025 23:02:39 +0400 Subject: [PATCH] =?UTF-8?q?=D0=BD=D0=B5=D0=BC=D0=BD=D0=BE=D0=B3=D0=BE=20?= =?UTF-8?q?=D0=BD=D0=B5=D1=80=D0=B0=D0=B1=D0=BE=D1=87=D0=B5=D0=B5?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .gitignore | 4 +- lab_8/lab8.ipynb | 118 +++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 121 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index d229cc0..cf52170 100644 --- a/.gitignore +++ b/.gitignore @@ -178,4 +178,6 @@ cython_debug/ /lab_2/aimenv -/lab_3/aimenv \ No newline at end of file +/lab_3/aimenv + +lab_8/static \ No newline at end of file diff --git a/lab_8/lab8.ipynb b/lab_8/lab8.ipynb index e69de29..8b527f9 100644 --- a/lab_8/lab8.ipynb +++ b/lab_8/lab8.ipynb @@ -0,0 +1,118 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Загрузка датасета" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Загружено 41 документов.\n" + ] + } + ], + "source": [ + "import os\n", + "import win32com.client\n", + "\n", + "# Путь к папке с распакованными файлами\n", + "data_dir = r\"C:/Users/Egor/Desktop/ULSTU\\AI/aim/AIM-PIbd-32-Petrushin-E-A/lab_8/static\"\n", + "\n", + "# Инициализация Word\n", + "word = win32com.client.Dispatch(\"Word.Application\")\n", + "word.visible = False\n", + "\n", + "# Чтение всех .doc файлов\n", + "texts = []\n", + "for filename in os.listdir(data_dir):\n", + " if filename.endswith(\".doc\"):\n", + " file_path = os.path.join(data_dir, filename)\n", + " try:\n", + " doc = word.Documents.Open(file_path)\n", + " text = doc.Content.Text\n", + " texts.append(text)\n", + " doc.Close()\n", + " except Exception as e:\n", + " print(f\"Ошибка при чтении файла {filename}: {e}\")\n", + "\n", + "# Закрытие Word\n", + 
"word.Quit()\n", + "\n", + "# Теперь texts содержит список текстов\n", + "print(f\"Загружено {len(texts)} документов.\")" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[nltk_data] Downloading package stopwords to\n", + "[nltk_data] C:\\Users\\Egor\\AppData\\Roaming\\nltk_data...\n", + "[nltk_data] Unzipping corpora\\stopwords.zip.\n", + "[nltk_data] Downloading package wordnet to\n", + "[nltk_data] C:\\Users\\Egor\\AppData\\Roaming\\nltk_data...\n" + ] + } + ], + "source": [ + "import re\n", + "from nltk.corpus import stopwords\n", + "from nltk.stem import WordNetLemmatizer\n", + "import nltk\n", + "\n", + "# Загрузка стоп-слов и лемматизатора\n", + "nltk.download('stopwords')\n", + "nltk.download('wordnet')\n", + "stop_words = set(stopwords.words('russian')) # Для русского языка\n", + "lemmatizer = WordNetLemmatizer()\n", + "\n", + "def preprocess_text(text):\n", + " # Удаление спецсимволов\n", + " text = re.sub(r'\\W', ' ', text)\n", + " # Приведение к нижнему регистру\n", + " text = text.lower()\n", + " # Удаление стоп-слов и лемматизация\n", + " tokens = [lemmatizer.lemmatize(word) for word in text.split() if word not in stop_words]\n", + " return ' '.join(tokens)\n", + "\n", + "# Применение предобработки к каждому документу\n", + "texts = [preprocess_text(text) for text in texts]" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.5" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}