Nice folders. Too bad I don't have a dad :(
This commit is contained in:
parent 479392c652
commit 87246af738
115 genetic_algorithm/genetic_algorithm.py Normal file
@@ -0,0 +1,115 @@
import json
from datetime import datetime, timedelta
import random


# Load the data from a JSON file and turn it into a graph
def load_graph_from_json(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)

    graph = {}
    start_point = data['from']
    end_point = data['to']

    for flight in data['flights']:
        departure_point = flight['departurePoint']
        destination_point = flight['destinationPoint']
        distance = flight['distance']
        departure_time = datetime.fromisoformat(flight['departureTime'])
        destination_time = datetime.fromisoformat(flight['destinationTime'])

        if departure_point not in graph:
            graph[departure_point] = {}

        graph[departure_point][destination_point] = (distance, departure_time, destination_time)

    return graph, start_point, end_point


# Compute the length and travel times of a path, enforcing a minimum transfer interval
def path_length_and_time(path, graph):
    length = 0
    start_time = graph[path[0]][path[1]][1]
    end_time = start_time
    for i in range(len(path) - 1):
        if path[i] not in graph or path[i + 1] not in graph[path[i]]:
            return float('inf'), start_time, end_time
        length += graph[path[i]][path[i + 1]][0]
        if i > 0:  # The first leg needs no transfer check
            if end_time + timedelta(minutes=5) > graph[path[i]][path[i + 1]][1]:
                return float('inf'), start_time, end_time  # Infeasible connection
        end_time = graph[path[i]][path[i + 1]][2]
    return length, start_time, end_time


# Generate the initial population of random paths
def generate_population(size, start, end, graph):
    population = []
    for _ in range(size):
        path = [start]
        while path[-1] != end:
            if path[-1] not in graph or not graph[path[-1]]:
                break
            next_node = random.choice(list(graph[path[-1]].keys()))
            if next_node not in path:
                path.append(next_node)
            elif all(node in path for node in graph[path[-1]]):
                break  # Dead end: every neighbour has already been visited
        if path[-1] == end:
            population.append(path)
    return population


# Select parents: keep the better (shorter) half of the population
def select_parents(population, graph):
    sorted_population = sorted(population, key=lambda p: path_length_and_time(p, graph)[0])
    return sorted_population[:len(sorted_population) // 2]


# Crossover: take the first half of parent1, then append the missing genes from parent2
def crossover(parent1, parent2):
    crossover_point = len(parent1) // 2
    child = parent1[:crossover_point]
    for gene in parent2:
        if gene not in child:
            child.append(gene)
    return child


# Mutation: swap two random intermediate vertices
def mutate(child):
    if len(child) <= 2:
        return child  # Nothing to mutate when there are no intermediate vertices
    index1 = random.randint(1, len(child) - 2)
    index2 = random.randint(1, len(child) - 2)
    child[index1], child[index2] = child[index2], child[index1]
    return child


# Main loop of the genetic algorithm
def genetic_algorithm(start, end, graph, population_size=100, generations=100):
    population = generate_population(population_size, start, end, graph)
    for _ in range(generations):
        parents = select_parents(population, graph)
        new_population = parents[:]
        while len(new_population) < population_size:
            parent1 = random.choice(parents)
            parent2 = random.choice(parents)
            child = crossover(parent1, parent2)
            if random.random() < 0.1:  # Mutation probability
                child = mutate(child)
            new_population.append(child)
        population = new_population
    best_path = min(population, key=lambda p: path_length_and_time(p, graph)[0])
    best_length, start_time, end_time = path_length_and_time(best_path, graph)
    return best_path, best_length, start_time, end_time


# Example usage
file_path = './/data_ga.json'
graph, start_point, end_point = load_graph_from_json(file_path)

best_path, length, start_time, end_time = genetic_algorithm(start_point, end_point, graph)
print("Наиболее короткий путь от", start_point, "до", end_point, ":", best_path)
print("Длина пути:", length)
print("Начальное время:", start_time.strftime("%d %B %Y %H:%M"))
print("Конечное время:", end_time.strftime("%d %B %Y %H:%M"))
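The loader above only needs data_ga.json to provide 'from', 'to' and a 'flights' list with departurePoint, destinationPoint, distance, departureTime and destinationTime fields. A minimal illustrative input written from Python; only the field names come from load_graph_from_json, the cities, distances and timestamps are invented:

import json

example = {
    "from": "Moscow",
    "to": "Kazan",
    "flights": [
        {"departurePoint": "Moscow", "destinationPoint": "Nizhny Novgorod",
         "distance": 400,
         "departureTime": "2024-05-01T08:00:00",
         "destinationTime": "2024-05-01T09:10:00"},
        {"departurePoint": "Nizhny Novgorod", "destinationPoint": "Kazan",
         "distance": 350,
         "departureTime": "2024-05-01T09:40:00",  # departs more than 5 minutes after the previous arrival
         "destinationTime": "2024-05-01T10:50:00"},
    ],
}

with open("data_ga.json", "w", encoding="utf-8") as f:
    json.dump(example, f, ensure_ascii=False, indent=2)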
@@ -1,29 +0,0 @@
from matplotlib.lines import Line2D

vertex = ((0, 1), (1, 1), (0.5, 0.8), (0.1, 0.5), (0.8, 0.2), (0.4, 0))

vx = [v[0] for v in vertex]
vy = [v[1] for v in vertex]

def show_graph(ax, best):
    ax.add_line(Line2D((vertex[0][0], vertex[1][0]), (vertex[0][1], vertex[1][1]), color='#aaa'))
    ax.add_line(Line2D((vertex[0][0], vertex[2][0]), (vertex[0][1], vertex[2][1]), color='#aaa'))
    ax.add_line(Line2D((vertex[0][0], vertex[3][0]), (vertex[0][1], vertex[3][1]), color='#aaa'))
    ax.add_line(Line2D((vertex[1][0], vertex[2][0]), (vertex[1][1], vertex[2][1]), color='#aaa'))
    ax.add_line(Line2D((vertex[2][0], vertex[5][0]), (vertex[2][1], vertex[5][1]), color='#aaa'))
    ax.add_line(Line2D((vertex[2][0], vertex[4][0]), (vertex[2][1], vertex[4][1]), color='#aaa'))
    ax.add_line(Line2D((vertex[3][0], vertex[5][0]), (vertex[3][1], vertex[5][1]), color='#aaa'))
    ax.add_line(Line2D((vertex[4][0], vertex[5][0]), (vertex[4][1], vertex[5][1]), color='#aaa'))

    startV = 0
    for i, v in enumerate(best):
        if i == 0:
            continue

        prev = startV
        v = v[:v.index(i) + 1]
        for j in v:
            ax.add_line(Line2D((vertex[prev][0], vertex[j][0]), (vertex[prev][1], vertex[j][1]), color='r'))
            prev = j

    ax.plot(vx, vy, ' ob', markersize=15)
6 model.py
@@ -5,14 +5,14 @@ from keras.src.legacy.preprocessing.text import Tokenizer
 from keras.src.utils import pad_sequences
 
 # Load the model
-model = tf.keras.models.load_model('best_model_lstm_negative.keras')
+model = tf.keras.models.load_model('.//neural_network/create_lstm/model/best_model_lstm_negative.keras')
 
 # Load the tokenizer
-with open('tokenizer_lstm_lstm_negative.pickle', 'rb') as handle:
+with open('.//neural_network/create_lstm/tokenizer/tokenizer_lstm_lstm_negative.pickle', 'rb') as handle:
     tokenizer = pickle.load(handle)
 
 # Load the class names
-with open('class_names_lstm_negative.txt', 'r', encoding='utf-8') as file:
+with open('.//neural_network/create_lstm/class/class_names_lstm_negative.txt', 'r', encoding='utf-8') as file:
     class_names = [line.strip() for line in file.readlines()]
 
 def preprocess_text(text: str):
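The hunk ends before the body of preprocess_text, so the rest of model.py is not shown. A hedged sketch of how the objects loaded above are typically combined for a prediction, mirroring the inference step at the end of the training scripts below; the function name, the plain-text input and the hard-coded length 90 are assumptions, not code from model.py:

def predict_rubric(text: str) -> str:
    # Tokenize and pad the review the same way the training scripts do, then
    # map the most probable class index back to its saved name.
    sequences = tokenizer.texts_to_sequences([text])
    x = pad_sequences(sequences, maxlen=90)  # 90 = max_reviews_len in the training scripts
    probabilities = model.predict(x)[0]
    return class_names[int(probabilities.argmax())]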
0 neural_network/create_lstm/class/.gitkeep Normal file
149 neural_network/create_lstm/create_model_lstm_negative.py Normal file
@@ -0,0 +1,149 @@
import pickle

import pandas as pd
from keras import Sequential
from keras.src.callbacks import ModelCheckpoint
from keras.src.legacy.preprocessing.text import Tokenizer
from keras.src.saving import load_model
from keras.src.utils import pad_sequences
from matplotlib import pyplot as plt
from tensorflow.keras.layers import Dense, Embedding, Dropout, LSTM, SpatialDropout1D
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras import utils
from nltk import word_tokenize, SnowballStemmer
from nltk.corpus import stopwords
import string

from pymystem3 import Mystem

# Maximum vocabulary size
num_words = 10000
# Maximum review length
max_reviews_len = 90
# Number of review classes
nb_classes = 10


def remove_stopwords_and_punctuation(reviews):
    stop_words = set(stopwords.words('russian'))
    punctuation = set(string.punctuation)
    filtered_tokens = []

    # Remove stop words and punctuation
    for review in reviews:
        words = word_tokenize(review)
        filtered_words = [word for word in words if
                          word.lower() not in stop_words and word != '\\n1' and word != "''" and word != '«' and word != '»' and word not in punctuation]
        filtered_tokens.extend(filtered_words)

    return filtered_tokens


def lemmatize_token(filtered_tokens):
    mystem = Mystem()
    lemmatized_tokens = []
    for token in filtered_tokens:
        lemmatized = mystem.lemmatize(token)[0]
        lemmatized_tokens.append(lemmatized)
        print(lemmatized)
    return lemmatized_tokens


def stemmer_token(filtered_tokens):
    stemmer = SnowballStemmer("russian")
    stemmer_tokens = [stemmer.stem(word) for word in filtered_tokens]
    return stemmer_tokens


# Data
train = pd.read_csv('../dataset/filtered/filtered_dataset_negative.csv')
train.drop(['address', 'name_ru', 'rating'], axis=1, inplace=True)
reviews = train['text']
print("Набор данных готов")

filtered_tokens = remove_stopwords_and_punctuation(reviews)
print(filtered_tokens[:10])

# lemmatized_tokens = lemmatize_token(filtered_tokens)
# print(lemmatized_tokens[:10])

# stemmer_tokens = stemmer_token(filtered_tokens)
# print(stemmer_tokens[:10])

label_encoder = LabelEncoder()
train['rubrics'] = label_encoder.fit_transform(train['rubrics'])

# Keep the class names
class_names = label_encoder.classes_

y_train = utils.to_categorical(train['rubrics'], nb_classes)

tokenizer = Tokenizer(num_words=num_words)
tokenizer.fit_on_texts(filtered_tokens)

sequences = tokenizer.texts_to_sequences(reviews)

x_train = pad_sequences(sequences, maxlen=max_reviews_len)

print('начинается создание модели')

# Build the model
model_lstm = Sequential()
model_lstm.add(Embedding(num_words, 128, input_length=max_reviews_len))
model_lstm.add(SpatialDropout1D(0.2))
model_lstm.add(LSTM(64, dropout=0.2, recurrent_dropout=0.2))
model_lstm.add(Dense(nb_classes, activation='softmax'))

model_lstm.compile(optimizer='adam',
                   loss='categorical_crossentropy',
                   metrics=['accuracy'])

model_lstm.summary()

# Define the ModelCheckpoint callback
model_lstm_save_path = './/model/best_model_lstm_negative.keras'
checkpoint_callback_lstm = ModelCheckpoint(model_lstm_save_path,
                                           monitor='val_accuracy',
                                           save_best_only=True,
                                           verbose=1)

# Train the model
history_lstm = model_lstm.fit(x_train,
                              y_train,
                              epochs=10,
                              batch_size=128,
                              validation_split=0.1,
                              callbacks=[checkpoint_callback_lstm])

# Plots
plt.plot(history_lstm.history['accuracy'],
         label='Доля верных ответов на обучающем наборе')
plt.plot(history_lstm.history['val_accuracy'],
         label='Доля верных ответов на проверочном наборе')
plt.xlabel('Эпоха обучения')
plt.ylabel('Доля верных ответов')
plt.legend()
plt.show()

# Load the best saved model
model_lstm = load_model('.//model/best_model_lstm_negative.keras')

# Example user review text
user_review = "Не люблю пьяных людей, когда они рядом, всегда будет что то плохое"

# Prepare the user review
filtered_tokens = remove_stopwords_and_punctuation([user_review])
sequences = tokenizer.texts_to_sequences(filtered_tokens)
x_user = pad_sequences(sequences, maxlen=max_reviews_len)

# Probabilities of the user review belonging to each class
predicted_probabilities = model_lstm.predict(x_user)

# Print the probabilities together with the class names
for class_name, prob in zip(class_names, predicted_probabilities[0]):
    print(f"Вероятность отзыва относится к классу '{class_name}': {prob}")

# Save the class names to a text file
with open('.//class/class_names_lstm_negative.txt', 'w', encoding='utf-8') as file:
    for class_name in class_names:
        file.write(f"{class_name}\n")

# Save the tokenizer
with open('.//tokenizer/tokenizer_lstm_lstm_negative.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)
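Both training scripts rely on NLTK data: word_tokenize needs the 'punkt' tokenizer models and stopwords.words('russian') needs the 'stopwords' corpus. A one-time setup sketch, assuming a standard NLTK installation:

import nltk

# Downloads are cached; rerunning is harmless if the packages are already present.
nltk.download('punkt')
nltk.download('stopwords')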
148 neural_network/create_lstm/create_model_lstm_positive.py Normal file
@@ -0,0 +1,148 @@
import pandas as pd
from keras import Sequential
from keras.src.callbacks import ModelCheckpoint
from keras.src.legacy.preprocessing.text import Tokenizer
from keras.src.saving import load_model
from keras.src.utils import pad_sequences
from matplotlib import pyplot as plt
from tensorflow.keras.layers import Dense, Embedding, Dropout, LSTM, SpatialDropout1D
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras import utils
from nltk import word_tokenize, SnowballStemmer
from nltk.corpus import stopwords
import string
import pickle

from pymystem3 import Mystem

# Maximum vocabulary size
num_words = 10000
# Maximum review length
max_reviews_len = 90
# Number of review classes
nb_classes = 10


def remove_stopwords_and_punctuation(reviews):
    stop_words = set(stopwords.words('russian'))
    punctuation = set(string.punctuation)
    filtered_tokens = []

    # Remove stop words and punctuation
    for review in reviews:
        words = word_tokenize(review)
        filtered_words = [word for word in words if
                          word.lower() not in stop_words and word != '\\n1' and word != "''" and word != '«' and word != '»' and word not in punctuation]
        filtered_tokens.extend(filtered_words)

    return filtered_tokens


def lemmatize_token(filtered_tokens):
    mystem = Mystem()
    lemmatized_tokens = []
    for token in filtered_tokens:
        lemmatized = mystem.lemmatize(token)[0]
        lemmatized_tokens.append(lemmatized)
        print(lemmatized)
    return lemmatized_tokens


def stemmer_token(filtered_tokens):
    stemmer = SnowballStemmer("russian")
    stemmer_tokens = [stemmer.stem(word) for word in filtered_tokens]
    return stemmer_tokens


# Data
train = pd.read_csv('..//dataset/filtered/filtered_dataset_positive.csv')
train.drop(['address', 'name_ru', 'rating'], axis=1, inplace=True)
reviews = train['text']
print("Набор данных готов")

filtered_tokens = remove_stopwords_and_punctuation(reviews)
print(filtered_tokens[:10])

# lemmatized_tokens = lemmatize_token(filtered_tokens)
# print(lemmatized_tokens[:10])

# stemmer_tokens = stemmer_token(filtered_tokens)
# print(stemmer_tokens[:10])

label_encoder = LabelEncoder()
train['rubrics'] = label_encoder.fit_transform(train['rubrics'])

# Keep the class names
class_names = label_encoder.classes_

y_train = utils.to_categorical(train['rubrics'], nb_classes)

tokenizer = Tokenizer(num_words=num_words)
tokenizer.fit_on_texts(filtered_tokens)

sequences = tokenizer.texts_to_sequences(reviews)

x_train = pad_sequences(sequences, maxlen=max_reviews_len)

print('начинается создание модели')

# Build the model
model_lstm = Sequential()
model_lstm.add(Embedding(num_words, 256, input_length=max_reviews_len))
model_lstm.add(SpatialDropout1D(0.2))
model_lstm.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
model_lstm.add(Dense(nb_classes, activation='softmax'))

model_lstm.compile(optimizer='adam',
                   loss='categorical_crossentropy',
                   metrics=['accuracy'])

model_lstm.summary()

# Define the ModelCheckpoint callback
model_lstm_save_path = './/model/best_model_lstm_positive.keras'
checkpoint_callback_lstm = ModelCheckpoint(model_lstm_save_path,
                                           monitor='val_accuracy',
                                           save_best_only=True,
                                           verbose=1)

# Train the model
history_lstm = model_lstm.fit(x_train,
                              y_train,
                              epochs=5,
                              batch_size=128,
                              validation_split=0.1,
                              callbacks=[checkpoint_callback_lstm])

# Plots
plt.plot(history_lstm.history['accuracy'],
         label='Доля верных ответов на обучающем наборе')
plt.plot(history_lstm.history['val_accuracy'],
         label='Доля верных ответов на проверочном наборе')
plt.xlabel('Эпоха обучения')
plt.ylabel('Доля верных ответов')
plt.legend()
plt.show()

# Load the best saved model
model_lstm = load_model('.//model/best_model_lstm_positive.keras')

# Example user review text
user_review = "Ой я так люблю вкусно кушать! Я бы хотел сьесть все самое вкусное в этом мире! Пончики, пиццу, вообще все все все!"

# Prepare the user review
filtered_tokens = remove_stopwords_and_punctuation([user_review])
sequences = tokenizer.texts_to_sequences(filtered_tokens)
x_user = pad_sequences(sequences, maxlen=max_reviews_len)

# Probabilities of the user review belonging to each class
predicted_probabilities = model_lstm.predict(x_user)

# Print the probabilities together with the class names
for class_name, prob in zip(class_names, predicted_probabilities[0]):
    print(f"Вероятность отзыва относится к классу '{class_name}': {prob}")

# Save the class names to a text file
with open('.//class/class_names_lstm_positive.txt', 'w', encoding='utf-8') as file:
    for class_name in class_names:
        file.write(f"{class_name}\n")

# Save the tokenizer
with open('.//tokenizer/tokenizer_lstm_positive.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)
0 neural_network/create_lstm/model/.gitkeep Normal file
0 neural_network/create_lstm/tokenizer/.gitkeep Normal file
0 neural_network/dataset/class/.gitkeep Normal file
54 neural_network/dataset/conversion.py Normal file
@@ -0,0 +1,54 @@
import pandas as pd

# Load the dataset
dataset = pd.read_csv('../dataset/geo-reviews-dataset-2023.csv')

# New DataFrames filtered by rating (copies, so the .loc assignments below do not warn)
filtered_dataset_positive = dataset[dataset['rating'] > 3].copy()
filtered_dataset_negative = dataset[dataset['rating'] < 3].copy()

# Keep only the first rubric in the 'rubrics' column (drop everything after the ';')
filtered_dataset_positive.loc[:, 'rubrics'] = filtered_dataset_positive['rubrics'].apply(lambda x: x.split(';')[0] if pd.notnull(x) else x)
filtered_dataset_negative.loc[:, 'rubrics'] = filtered_dataset_negative['rubrics'].apply(lambda x: x.split(';')[0] if pd.notnull(x) else x)

# Merge related rubrics under a single name
rename_mapping = {'Пиццерия': 'Быстрое питание', 'Ресторан': 'Быстрое питание', 'Кафе': 'Быстрое питание', 'Магазин продуктов': 'Супермаркет', 'Аптека': 'Медцентр, клиника', 'Стоматологическая клиника': 'Медцентр, клиника'}
filtered_dataset_positive.loc[:, 'rubrics'] = filtered_dataset_positive['rubrics'].replace(rename_mapping)
filtered_dataset_negative.loc[:, 'rubrics'] = filtered_dataset_negative['rubrics'].replace(rename_mapping)

# The ten most frequent rubrics in each subset
unique_rubrics_positive = set(filtered_dataset_positive['rubrics'].value_counts().head(10).index.tolist())
unique_rubrics_negative = set(filtered_dataset_negative['rubrics'].value_counts().head(10).index.tolist())

# Keep only the rows whose rubric is in unique_rubrics_positive / unique_rubrics_negative
filtered_dataset_positive = filtered_dataset_positive[filtered_dataset_positive['rubrics'].isin(unique_rubrics_positive)]
filtered_dataset_negative = filtered_dataset_negative[filtered_dataset_negative['rubrics'].isin(unique_rubrics_negative)]

# Cap the number of rows per rubric
filtered_dataset_negative = (filtered_dataset_negative.groupby('rubrics').head(1500).reset_index(drop=True))
filtered_dataset_positive = (filtered_dataset_positive.groupby('rubrics').head(15000).reset_index(drop=True))


# Print the row count for each rubric in unique_rubrics_positive
for rubric in unique_rubrics_positive:
    count = filtered_dataset_positive[filtered_dataset_positive['rubrics'] == rubric].shape[0]
    print(f"Количество строк с rubrics '{rubric}' в filtered_dataset_positive: {count}")

# Print the row count for each rubric in unique_rubrics_negative
for rubric in unique_rubrics_negative:
    count = filtered_dataset_negative[filtered_dataset_negative['rubrics'] == rubric].shape[0]
    print(f"Количество строк с rubrics '{rubric}' в filtered_dataset_negative: {count}")

# Save the positive class names to class_positive.txt (UTF-8)
with open('class/class_positive.txt', 'w', encoding='utf-8') as file:
    for rubric in unique_rubrics_positive:
        file.write(f"{rubric}\n")

# Save the negative class names to class_negative.txt (UTF-8)
with open('class/class_negative.txt', 'w', encoding='utf-8') as file:
    for rubric in unique_rubrics_negative:
        file.write(f"{rubric}\n")

# Save the filtered DataFrames to new CSV files
filtered_dataset_positive.to_csv('../dataset/filtered/filtered_dataset_positive.csv', index=False)
filtered_dataset_negative.to_csv('../dataset/filtered/filtered_dataset_negative.csv', index=False)
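To see what the rubric cleanup does to a single value, a small illustrative check; the sample strings are invented, while the split and the renaming come from the code above:

import pandas as pd

sample = pd.DataFrame({'rubrics': ['Пиццерия;Доставка еды', 'Супермаркет', 'Аптека']})
sample['rubrics'] = sample['rubrics'].apply(lambda x: x.split(';')[0] if pd.notnull(x) else x)
sample['rubrics'] = sample['rubrics'].replace({'Пиццерия': 'Быстрое питание', 'Аптека': 'Медцентр, клиника'})
print(sample['rubrics'].tolist())  # ['Быстрое питание', 'Супермаркет', 'Медцентр, клиника']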
18 neural_network/dataset/download_dataset.py Normal file
@@ -0,0 +1,18 @@
import kaggle
import zipfile
import os

# Authentication
kaggle.api.authenticate()

# Download a specific file from the dataset
dataset = 'kyakovlev/yandex-geo-reviews-dataset-2023'
file_name = 'geo-reviews-dataset-2023.csv'
kaggle.api.dataset_download_file(dataset, file_name, path='../dataset')

# Unpack the archive if the file came compressed
zip_path = f'./{file_name}.zip'
if os.path.exists(zip_path):
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall('./')
    os.remove(zip_path)  # Remove the zip file after extraction
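kaggle.api.authenticate() expects Kaggle API credentials, normally a ~/.kaggle/kaggle.json token file. The kaggle package can also read them from environment variables; a sketch with placeholder values, not real credentials:

import os

# Set before importing kaggle so that authenticate() can pick the values up.
os.environ.setdefault('KAGGLE_USERNAME', 'your-username')
os.environ.setdefault('KAGGLE_KEY', 'your-api-key')

import kaggle
kaggle.api.authenticate()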
0 neural_network/dataset/filtered/.gitkeep Normal file