diff --git a/schemas.py b/schemas.py
index d9efd95..954a90f 100644
--- a/schemas.py
+++ b/schemas.py
@@ -42,6 +42,7 @@ class TripRequest(BaseModel):
 
 class Review(BaseModel):
     city: str
+    airport: str
     address: str
     name_ru: str
 
diff --git a/sity/conversion_sity.py b/sity/conversion_sity.py
index 9357a0b..e921765 100644
--- a/sity/conversion_sity.py
+++ b/sity/conversion_sity.py
@@ -4,18 +4,20 @@ import pandas as pd
 class CityFilter:
     def __init__(self, json_file_path):
         self.json_file_path = json_file_path
-        self.cities_set = self.load_and_extract_cities()
+        self.cities_airports = self.load_and_extract_cities_airports()
 
-    def load_and_extract_cities(self):
+    def load_and_extract_cities_airports(self):
         data = self.load_json(self.json_file_path)
-        cities_set = set()
+        cities_airports = {}
         for entry in data:
             parts = entry['label'].split(',')
             if len(parts) > 1:
-                city1 = parts[0].strip().split()[0] if parts[0].strip().split() else ''
-                city2 = parts[1].strip().split()[0] if parts[1].strip().split() else ''
-                cities_set.update([city1, city2])
-        return cities_set
+                city1 = ' '.join(parts[0].strip().split()[:2])  # Возьмем первые два слова
+                city2 = ' '.join(parts[1].strip().split()[:2])  # Возьмем первые два слова
+                full_address = entry['label'].strip()
+                cities_airports[city1] = full_address
+                cities_airports[city2] = full_address
+        return cities_airports
 
     @staticmethod
     def load_json(file_path):
@@ -24,23 +26,26 @@ class CityFilter:
         return data
 
     @staticmethod
-    def find_city(address, cities_set):
+    def find_city_and_airport(address, cities_airports):
         parts = address.split(',')
         for part in parts:
             words = part.strip().split()
-            for word in words:
-                if word in cities_set:
-                    return word
-        return None
+            for i in range(len(words)):
+                city_1 = ' '.join(words[i:i+1])
+                city_2 = ' '.join(words[i:i+2])
+                if city_1 in cities_airports:
+                    return city_1, cities_airports[city_1]
+                if city_2 in cities_airports:
+                    return city_2, cities_airports[city_2]
+        return None, None
 
     def filter_cities_in_csv(self, csv_file_path, output_path):
         df = pd.read_csv(csv_file_path)
-        df['city'] = df['address'].apply(lambda x: self.find_city(x, self.cities_set))
+        df['city'], df['airport'] = zip(*df['address'].apply(lambda x: self.find_city_and_airport(x, self.cities_airports)))
         df = df[df['city'].notnull()]
         df.to_csv(output_path, index=False)
         print(f"Filtered entries:\n{df.head(15)}")
 
-
 # Пример использования класса
 json_file_path = 'airports.json'
 csv_file_path_positive = '../neural_network/dataset/filtered/filtered_dataset_positive.csv'