277 KiB
277 KiB
Решение задачи регрессии на основе нечеткого логического вывода
Подготовил данные: закодировал категориальные признаки числами. Список стран, упорядоченный по качеству жизни, нужен для ранжирования стран проживания. Разбил данные на обучающую и тестовую выборки.
In [246]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import skfuzzy as fuzz
import numpy as np
from skfuzzy import control as ctrl
# Load the salaries dataset (path is relative to the notebook's working directory).
data = pd.read_csv("..//static//csv//ds_salaries.csv")

# Country codes used to turn `employee_residence` into an ordinal position
# (presumably ordered best-first by quality of life — confirm with the author).
# NOTE(review): several codes are repeated ('SG', 'UA', 'KR', 'PT', 'TH', 'IN').
# In the dict comprehension below the LAST occurrence of a code wins, so those
# countries get a later position than their first appearance — confirm intent.
countries_by_quality_of_life = [
    'FI', 'SE', 'DK', 'NL', 'CH', 'AU', 'CA', 'DE', 'GB', 'IE',
    'FR', 'US', 'PT', 'ES', 'AT', 'IT', 'SG', 'UA', 'KR', 'BE',
    'SG', 'NO', 'PL', 'LV', 'SK', 'HR', 'CZ', 'LI', 'LT', 'RO',
    'GR', 'HU', 'PH', 'MY', 'IN', 'TH', 'BR', 'CL', 'KR', 'TR',
    'CO', 'MX', 'UA', 'PR', 'AE', 'IL', 'KR', 'NG', 'RS', 'ID',
    'RU', 'PT', 'JP', 'SI', 'UA', 'TH', 'EG', 'IR', 'IN'
]
# Country code -> position in the list above.
order_dict = {label: i for i, label in enumerate(countries_by_quality_of_life)}

exp_label_encoder = LabelEncoder()
res_label_encoder = LabelEncoder()

# Encode the experience level categories as integer labels.
data['experience_level_encoded'] = exp_label_encoder.fit_transform(data['experience_level'])
# Map each residence code to its list position, then re-encode the positions as
# consecutive integer labels. Codes absent from `order_dict` become NaN in the
# mapping step before they reach the encoder.
data["employee_residence_encoded"] = res_label_encoder.fit_transform(data['employee_residence'].map(order_dict))

# 80/20 split with a fixed seed so the split is reproducible.
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)

# Persist both splits and read them back from disk.
train_data.to_csv("train_data.csv", index=False)
test_data.to_csv("test_data.csv", index=False)
train_test_data = pd.read_csv("train_data.csv")
# Fix: the hold-out frame is read from test_data.csv. It was previously read
# from train_data.csv, which made the "test" evaluation run on training rows.
test_data_save = pd.read_csv("test_data.csv")
Инициализация лингвистических переменных и автоматическое формирование нечетких переменных
In [247]:
# Universes of discourse: the distinct encoded values present in the training
# frame (sorted ascending) for the inputs, and a fixed salary grid for the output.
exp_universe = train_test_data["experience_level_encoded"].sort_values().unique()
res_universe = train_test_data["employee_residence_encoded"].sort_values().unique()
salary_universe = np.arange(5000, 850000, 1000)

experience_level = ctrl.Antecedent(exp_universe, "exp")
residence_level = ctrl.Antecedent(res_universe, "res")
salary = ctrl.Consequent(salary_universe, "salary")

# Auto-generate three membership functions for each input and plot them.
for antecedent in (experience_level, residence_level):
    antecedent.automf(3, variable_type="categorical")
    antecedent.view()

# Auto-generate five membership functions for the output and plot them.
salary.automf(5, variable_type="quant")
salary.view()
Написал правила
In [248]:
# Rule base as (experience term, residence term, salary term) triples.
# Each triple reads: IF exp is <term> AND res is <term> THEN salary is <term>.
_RULE_TABLE = (
    ("low", "high", "lower"),
    ("low", "average", "low"),
    ("low", "low", "low"),
    ("average", "high", "low"),
    ("average", "average", "average"),
    ("average", "low", "high"),
    ("high", "high", "low"),
    ("high", "average", "high"),
    ("high", "low", "higher"),
)

fuzzy_rules = [
    ctrl.Rule(experience_level[exp_term] & residence_level[res_term], salary[salary_term])
    for exp_term, res_term, salary_term in _RULE_TABLE
]

# Keep the individual rule names available, in table order.
rule1, rule2, rule3, rule4, rule5, rule6, rule7, rule8, rule9 = fuzzy_rules

# Visualise the first rule.
rule1.view()

# Assemble the control system from the rules and create a simulation for it.
salary_ctrl = ctrl.ControlSystem(fuzzy_rules)
salary_sim = ctrl.ControlSystemSimulation(salary_ctrl)

# Last expression of the cell: show the rule list.
fuzzy_rules
Out[248]:
Проверка работы. Работает корректно.
In [249]:
# Smoke test: feed one fixed value to each input, run the inference,
# print the simulation state and show the resulting salary output.
sample_inputs = {"exp": 1, "res": 1}
for input_label, input_value in sample_inputs.items():
    salary_sim.input[input_label] = input_value

salary_sim.compute()
salary_sim.print_state()
display(salary_sim.output["salary"])
In [250]:
# Plot the salary variable using the state of the simulation computed above.
salary.view(sim=salary_sim)
Функция для вычисления целевой переменной
In [251]:
def fuzze_pred(row):
    """Run the shared fuzzy simulation for one record and return its salary output.

    Args:
        row: Record indexable by column name; must provide
            ``experience_level_encoded`` and ``employee_residence_encoded``.

    Returns:
        The value stored under ``"salary"`` in the simulation output after
        ``compute()`` has run.
    """
    # Simulation input label -> column of the record that feeds it.
    column_by_input = {
        "exp": "experience_level_encoded",
        "res": "employee_residence_encoded",
    }
    for input_label, column in column_by_input.items():
        salary_sim.input[input_label] = row[column]

    salary_sim.compute()
    return salary_sim.output["salary"]
Тестирование на обучающей выборке
In [252]:
# Work on a copy so the loaded training frame stays untouched.
result_data = train_test_data.copy()

# Predict a salary for every row, then round to whole units.
train_predictions = result_data.apply(fuzze_pred, axis=1)
result_data["salary_pred"] = train_predictions.round()

# Show actual vs. predicted salary for the first 20 rows.
comparison_columns = ["salary", "salary_pred"]
result_data[comparison_columns].head(20)
Out[252]:
Тестирование на тестовой выборке
In [ ]:
# Work on a copy so the frame loaded as `test_data_save` stays untouched.
result_test_data = test_data_save.copy()

# Predict a salary for every row, then round to whole units.
test_predictions = result_test_data.apply(fuzze_pred, axis=1)
result_test_data["salary_pred"] = test_predictions.round()

# Show the encoded inputs next to actual vs. predicted salary for the first 10 rows.
preview_columns = [
    "experience_level_encoded",
    "employee_residence_encoded",
    "salary",
    "salary_pred",
]
result_test_data[preview_columns].head(10)
Out[ ]:
Анализ получившегося на основе метрик
In [254]:
import math
from sklearn import metrics
# Actual and predicted salaries for each evaluated frame.
train_true, train_pred = result_data["salary"], result_data["salary_pred"]
test_true, test_pred = result_test_data["salary"], result_test_data["salary_pred"]

rmetrics = {
    # Root of the mean squared error on each frame.
    "RMSE_train": math.sqrt(metrics.mean_squared_error(train_true, train_pred)),
    "RMSE_test": math.sqrt(metrics.mean_squared_error(test_true, test_pred)),
    # NOTE(review): this is the square root of the mean absolute error, which
    # is not a standard metric (MAE is already in salary units) — confirm
    # whether plain MAE was intended.
    "RMAE_test": math.sqrt(metrics.mean_absolute_error(test_true, test_pred)),
    # Coefficient of determination on the test frame.
    "R2_test": metrics.r2_score(test_true, test_pred),
}

# Last expression of the cell: show the collected metrics.
rmetrics
Out[254]:
Модель получилась просто ужасной: RMSE слишком большой, R^2 отрицательный. Очень непонятный датасет: я не смог найти какие-либо зависимости и, соответственно, не смог корректно настроить модель.