# Origin: neural_network/dataset/sortTest.py, as added by commit
# a7aacfa323cd5596fe0a590962d2e0e5d35c386c (author: maksim,
# Tue, 4 Jun 2024 00:17:40 +0400). The commit message says, roughly, that
# this is a first working version which still has to be integrated into the
# project properly and exposed through an API.
"""Keep only the geo-review rows whose address mentions a known airport city.

The script builds a set of place-name tokens from an embedded JSON list of
airports, tags every row of the reviews CSV with the first token of its
address that appears in that set, drops the rows without a match, writes the
remaining rows to a new CSV file and prints the first 15 of them.

No sorting is performed; rows keep the order they have in the input file.
"""
import json

import pandas as pd

# Airport reference data. Only the "label" field is used by this script.
json_data = '''
[
  { "value": "AAQ", "label": "Анапа, Витязево, AAQ", "timezone": "+3", "latitude": 45.0029, "longitude": 37.3473 },
  { "value": "ARH", "label": "Архангельск, Талаги, ARH", "timezone": "+3", "latitude": 64.6003, "longitude": 40.7168 },
  { "value": "ASF", "label": "Астрахань, им. Б.Н. Кустодиева, ASF", "timezone": "+4", "latitude": 46.2833, "longitude": 48.0063 },
  { "value": "BAX", "label": "Барнаул, Михайловка, BAX", "timezone": "+7", "latitude": 53.3638, "longitude": 83.5385 }
]
'''

data = json.loads(json_data)

# Input and output locations, relative to the current working directory.
csv_file_path = 'geo-reviews-dataset-2023.csv'
output_file_path = 'sorted_filtered_geo-reviews-dataset-2023.csv'


def _first_word(text):
    """Return the first whitespace-separated token of *text*, or '' if none."""
    tokens = text.split()
    return tokens[0] if tokens else ''


def _leading_words(text):
    """Return the first word of each of the first two comma-separated parts.

    A text without any comma yields an empty tuple, so it never produces a
    match. Either returned word may be '' when its part is blank.
    """
    parts = text.split(',')
    if len(parts) < 2:
        return ()
    return (_first_word(parts[0]), _first_word(parts[1]))


def _collect_city_tokens(entries):
    """Build the set of non-empty leading words from the entries' labels.

    NOTE(review): only the first word of each part is kept, so multi-word
    names are reduced to their first word, and the second part of a label
    looks like an airport name rather than a city (it contributes tokens
    such as "им."). Kept as is to preserve the matching rules.
    """
    tokens = set()
    for entry in entries:
        # Blank tokens are skipped: an empty string in the set would make
        # every address with a blank leading part look like a match.
        tokens.update(word for word in _leading_words(entry['label']) if word)
    return tokens


cities_set = _collect_city_tokens(data)


def find_city(address, cities_set):
    """Return the token of *address* that is present in *cities_set*.

    The first word before the first comma is tried first, then the first
    word after it.

    Args:
        address: Address text. Non-string values (for example the NaN that
            pandas uses for a missing cell, or None) are treated as
            "no match" instead of raising.
        cities_set: Collection of known place-name tokens.

    Returns:
        The matching token, or None when the address is not a string,
        contains no comma, or neither leading word is in *cities_set*.
    """
    if not isinstance(address, str):
        return None
    for word in _leading_words(address):
        # An empty word is never a valid city, even if the caller's set
        # happens to contain ''.
        if word and word in cities_set:
            return word
    return None


def main():
    """Filter the reviews CSV by city, save the result and print a preview."""
    df = pd.read_csv(csv_file_path)

    # New column: the matched token for each address, or None.
    df['city'] = df['address'].apply(find_city, args=(cities_set,))

    # Keep only the rows whose address matched a token from the JSON data.
    df_filtered = df[df['city'].notnull()]

    # Save the filtered (not sorted) rows without the index column.
    df_filtered.to_csv(output_file_path, index=False)

    print(df_filtered.head(15))


if __name__ == "__main__":
    main()