54 lines
2.0 KiB
Python
54 lines
2.0 KiB
Python
|
import json
|
||
|
import pandas as pd
|
||
|
|
||
|
class CityFilter:
    """Keep only the CSV rows whose address mentions a known city token.

    The set of known tokens is built once, at construction time, from the
    ``label`` field of every entry in a JSON file.
    """

    def __init__(self, json_file_path):
        """Load the JSON file and build the lookup set.

        Args:
            json_file_path: Path to a JSON file containing an iterable of
                objects, each with a string ``label`` key.
        """
        self.json_file_path = json_file_path
        self.cities_set = self.load_and_extract_cities()

    def load_and_extract_cities(self):
        """Return the set of city tokens found in the JSON labels.

        For every label that contains at least one comma, the first
        whitespace-separated word of each of the first two comma-separated
        parts is collected. Labels without a comma are ignored.

        Returns:
            A set of non-empty strings.

        Raises:
            KeyError: If an entry has no ``label`` key.
        """
        data = self.load_json(self.json_file_path)
        cities_set = set()
        for entry in data:
            parts = entry['label'].split(',')
            if len(parts) < 2:
                continue
            for part in parts[:2]:
                words = part.split()
                # A blank part (e.g. the left side of ", Paris") has no
                # words; skip it instead of storing '' in the set.
                if words:
                    cities_set.add(words[0])
        return cities_set

    @staticmethod
    def load_json(file_path):
        """Read *file_path* as UTF-8 and return the parsed JSON content."""
        with open(file_path, 'r', encoding='utf-8') as file:
            return json.load(file)

    @staticmethod
    def find_city(address, cities_set):
        """Return the first word of *address* that is in *cities_set*.

        The address is scanned left to right, one comma-separated part at a
        time and word by word within each part. Matching is exact: it is
        case-sensitive and punctuation attached to a word is not stripped.

        Args:
            address: The address text to scan.
            cities_set: Collection of known city tokens.

        Returns:
            The matching word, or ``None`` when no word matches or when
            *address* is not a string.
        """
        # pandas represents an empty CSV cell as float NaN, which has no
        # ``split``; treat any non-string value as "no city found".
        if not isinstance(address, str):
            return None
        for part in address.split(','):
            for word in part.split():
                if word in cities_set:
                    return word
        return None

    def filter_cities_in_csv(self, csv_file_path, output_path):
        """Write the rows of a CSV whose address contains a known city.

        Reads *csv_file_path*, adds a ``city`` column holding the result of
        ``find_city`` for each value of the ``address`` column, drops the
        rows where no city was found, writes the remainder to *output_path*
        without the index, and prints the first 15 remaining rows.

        Args:
            csv_file_path: Input CSV; it must have an ``address`` column.
            output_path: Destination CSV path.

        Raises:
            KeyError: If the input has no ``address`` column.
        """
        df = pd.read_csv(csv_file_path)
        df['city'] = df['address'].apply(lambda x: self.find_city(x, self.cities_set))
        df = df[df['city'].notnull()]
        df.to_csv(output_path, index=False)
        print(f"Filtered entries:\n{df.head(15)}")
|
||
|
|
||
|
|
||
|
# Example usage of the class.
# Guarded so that importing this module does not read or write any files.
if __name__ == "__main__":
    json_file_path = 'airports.json'
    csv_file_path_positive = '../neural_network/dataset/filtered/filtered_dataset_positive.csv'
    csv_file_path_negative = '../neural_network/dataset/filtered/filtered_dataset_negative.csv'
    # Each output name matches the dataset it receives: the positive input is
    # written to the positive file and the negative input to the negative file.
    output_path_positive = '../sity/sity_positive.csv'
    output_path_negative = '../sity/sity_negative.csv'

    city_filter = CityFilter(json_file_path)
    city_filter.filter_cities_in_csv(csv_file_path_positive, output_path_positive)
    city_filter.filter_cities_in_csv(csv_file_path_negative, output_path_negative)
|