Lab 8
Dataset: requirements specifications (TZ) and IT articles (clustering, classification).
In [29]:
import spacy

# Load the large Russian spaCy pipeline
# (install it first with: python -m spacy download ru_core_news_lg)
sp = spacy.load("ru_core_news_lg")
Load the texts from the files into a DataFrame:
In [30]:
import os

import pandas as pd
from docx import Document

def read_docx(file_path):
    # Concatenate all paragraphs of a .docx file into one string
    doc = Document(file_path)
    full_text = []
    for paragraph in doc.paragraphs:
        full_text.append(paragraph.text)
    return "\n".join(full_text)

def load_docs(dataset_path):
    # Read every .docx in the directory into a (doc, text) DataFrame
    df = pd.DataFrame(columns=["doc", "text"])
    for file_name in os.listdir(dataset_path):
        # Skip Word's temporary lock files ("~$...")
        if file_name.startswith("~$"):
            continue
        text = read_docx(os.path.join(dataset_path, file_name))
        df.loc[len(df.index)] = [file_name, text]
    return df
df = load_docs("static/text/")

# Label: 0 = requirements specification ("tz_" prefix), 1 = IT article
df["type"] = df.apply(
    lambda row: 0 if str(row["doc"]).startswith("tz_") else 1, axis=1
)

df.info()
df.sort_values(by=["doc"], inplace=True)
display(df.head(), df.tail())
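As a quick sanity check (added here for illustration, not part of the original lab), the label distribution can be inspected to confirm both classes are present:
In [ ]:
# Illustrative check: counts of articles (1) vs. specifications (0)
print(df["type"].value_counts())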
Text preprocessing
In [31]:
import re

from num2words import num2words

def transform_text(text):
    text = re.sub(r'<[^<]+?>', '', text)   # strip HTML tags
    text = re.sub(r'http\S+', '', text)    # strip URLs
    text = text.lower()
    text = re.sub(r'\s+', ' ', text)       # collapse whitespace
    text = re.sub(r'[^a-zA-Zа-яА-Я0-9\s]', '', text)  # keep only letters and digits
    words: list[str] = text.split()
    # Spell out standalone numbers in Russian, e.g. "30" -> "тридцать"
    words = [num2words(word, lang="ru") if word.isdigit() else word for word in words]
    text = " ".join(words)
    # Remove any punctuation num2words may have introduced
    text = re.sub(r'[^\w\s]', '', text)
    return text
df["preprocessed_text"] = df["text"].apply(transform_text)
Tokenization, part-of-speech tagging, lemmatization, and filtering
In [32]:
from nltk.corpus import stopwords

# Requires the NLTK stopword corpus: nltk.download('stopwords')
stop_words = set(stopwords.words('russian'))

def is_valid_token(token):
    # Drop stopwords and abnormally long tokens (likely noise)
    return token.text not in stop_words and len(token.text) <= 20

def preprocess_text(text):
    doc = sp(text)
    # Encode each surviving token as lemma_POS_morphology
    filtered_tokens = map(
        lambda token: f"{token.lemma_}_{token.pos_}_{token.morph}",
        filter(is_valid_token, doc)
    )
    return " ".join(filtered_tokens)
df["preprocessed_text"] = df["preprocessed_text"].apply(preprocess_text)
first_text_tokens = df["preprocessed_text"].iloc[0].split()[:10]
print(" ".join(first_text_tokens))
N-grams
In [33]:
from nltk.tokenize import word_tokenize
from nltk.util import ngrams

# Requires the NLTK tokenizer models: nltk.download('punkt')
def generate_ngrams(text: str, n: int = 2) -> list[tuple]:
    tokens: list[str] = word_tokenize(text, language="russian")
    return list(ngrams(tokens, n))

df["bigrams"] = df["preprocessed_text"].apply(lambda x: generate_ngrams(x, n=2))
df["trigrams"] = df["preprocessed_text"].apply(lambda x: generate_ngrams(x, n=3))
df.head(10)
Out[33]:
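To get a feel for the result, the most frequent bigrams across the corpus can be counted (an added snippet, not part of the original lab):
In [ ]:
from collections import Counter

# Illustrative: five most common bigrams over all documents
bigram_counts = Counter(bg for row in df["bigrams"] for bg in row)
print(bigram_counts.most_common(5))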
Text vectorization. Bag of words:
In [34]:
from sklearn.feature_extraction.text import CountVectorizer

counts_vectorizer = CountVectorizer()
# fit_transform already returns a sparse CSR matrix
counts_matrix = counts_vectorizer.fit_transform(df["preprocessed_text"])

counts_df = pd.DataFrame(
    counts_matrix.toarray(),
    columns=counts_vectorizer.get_feature_names_out(),
)
counts_df.head()
Out[34]:
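A quick dimensionality check (an illustrative addition): the matrix has one row per document and one column per vocabulary term.
In [ ]:
# Illustrative: (number of documents, vocabulary size)
print(counts_matrix.shape)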
Frequency profile (TF-IDF):
In [35]:
from sklearn.feature_extraction.text import TfidfVectorizer

# sublinear_tf=True replaces raw term frequency tf with 1 + log(tf)
tfidf_vectorizer = TfidfVectorizer(sublinear_tf=True)
tfidf_matrix = tfidf_vectorizer.fit_transform(df["preprocessed_text"])

tfidf_df = pd.DataFrame(
    tfidf_matrix.toarray(),
    columns=tfidf_vectorizer.get_feature_names_out(),
)
tfidf_df.head(10)
Out[35]:
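For intuition (an added snippet, not part of the original), the highest-weighted terms of the first document can be listed; these should be terms frequent in that document but rare in the rest of the corpus:
In [ ]:
# Illustrative: top 10 TF-IDF terms for the first document
print(tfidf_df.iloc[0].sort_values(ascending=False).head(10))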
Training and evaluating the model:
In [43]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
)
from sklearn.model_selection import train_test_split, cross_val_score

def train_and_evaluate(X, y, test_size=0.2, cv=5):
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=42
    )
    model = RandomForestClassifier(n_estimators=100, random_state=42)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    metrics = {
        "Accuracy": accuracy_score(y_test, y_pred),
        "Precision": precision_score(y_test, y_pred),
        "Recall": recall_score(y_test, y_pred),
        "F1 Score": f1_score(y_test, y_pred),
        "ROC AUC": roc_auc_score(y_test, y_pred),
    }
    # Cross-validated F1 on the training split as an overfitting check
    f1_cv = cross_val_score(model, X_train, y_train, cv=cv, scoring='f1').mean()
    for metric, value in metrics.items():
        print(f"{metric}: {value:.4f}")
    print(f"Mean F1 Score (cross-validation): {f1_cv:.4f}")
    return model
X_tfidf = tfidf_df
X_counts = counts_df
y = df["type"]
print("TF-IDF Model")
model_tfidf = train_and_evaluate(X_tfidf, y)
print("Count Vectorizer Model")
model_counts = train_and_evaluate(X_counts, y)
The TF-IDF model shows signs of overfitting: its test-set scores are perfect while its cross-validation F1 is slightly lower. The Count Vectorizer model is more balanced but trails in accuracy.
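One way to probe the overfitting suspicion is a grid search with cross-validation over a lightly regularized forest. This is a sketch with hypothetical parameter ranges, not part of the original lab:
In [ ]:
from sklearn.model_selection import GridSearchCV

# Hypothetical grid; limiting depth and leaf size curbs overfitting
param_grid = {
    "n_estimators": [50, 100, 200],
    "max_depth": [None, 10, 20],
    "min_samples_leaf": [1, 2, 5],
}
search = GridSearchCV(
    RandomForestClassifier(random_state=42),
    param_grid,
    cv=5,
    scoring="f1",
)
search.fit(X_tfidf, y)
print(search.best_params_, f"F1: {search.best_score_:.4f}")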