Files
AIM-PIbd-31-Yaruskin-S-A/lab_6/laba6.ipynb
2025-03-01 04:31:18 +04:00

277 KiB
Raw Blame History

Решение регресии на основе нечеткого логического вывода

Подготовил данные. Перегнал категории в числа. Матрица нужна для поиска лучших стран. Создал тестовые и тренировочные выборки

In [246]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import skfuzzy as fuzz
import numpy as np
from skfuzzy import control as ctrl


data = pd.read_csv("..//static//csv//ds_salaries.csv")


countries_by_quality_of_life = [
    'FI', 'SE', 'DK', 'NL', 'CH', 'AU', 'CA', 'DE', 'GB', 'IE', 
    'FR', 'US', 'PT', 'ES', 'AT', 'IT', 'SG', 'UA', 'KR', 'BE', 
    'SG', 'NO', 'PL', 'LV', 'SK', 'HR', 'CZ', 'LI', 'LT', 'RO', 
    'GR', 'HU', 'PH', 'MY', 'IN', 'TH', 'BR', 'CL', 'KR', 'TR',
    'CO', 'MX', 'UA', 'PR', 'AE', 'IL', 'KR', 'NG', 'RS', 'ID', 
    'RU', 'PT', 'JP', 'SI', 'UA', 'TH', 'EG', 'IR', 'IN'
]

order_dict = {label: i for i, label in enumerate(countries_by_quality_of_life)}

exp_label_encoder = LabelEncoder()
res_label_encoder = LabelEncoder()

data['experience_level_encoded'] = exp_label_encoder.fit_transform(data['experience_level'])
data["employee_residence_encoded"] = res_label_encoder.fit_transform(data['employee_residence'].map(order_dict))

train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)

train_data.to_csv("train_data.csv", index=False)
test_data.to_csv("test_data.csv", index=False)

train_test_data = pd.read_csv("train_data.csv")
test_data_save = pd.read_csv("train_data.csv")

Инициализация лингвистических переменных и автоматическое формирование нечетких переменных

In [247]:
experience_level = ctrl.Antecedent(train_test_data["experience_level_encoded"].sort_values().unique(), "exp")
residence_level = ctrl.Antecedent(train_test_data["employee_residence_encoded"].sort_values().unique(), "res")
salary = ctrl.Consequent(np.arange(5000, 850000, 1000), "salary")

experience_level.automf(3, variable_type="categorical")
experience_level.view()
residence_level.automf(3, variable_type="categorical")
residence_level.view()
salary.automf(5, variable_type="quant")
salary.view()
c:\Users\salih\AppData\Local\Programs\Python\Python312\Lib\site-packages\skfuzzy\control\fuzzyvariable.py:125: UserWarning: FigureCanvasAgg is non-interactive, and thus cannot be shown
  fig.show()
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image

Написал правила

In [248]:
rule1 = ctrl.Rule(experience_level["low"] & residence_level["high"], salary["lower"])
rule2 = ctrl.Rule(experience_level["low"] & residence_level["average"], salary["low"])
rule3 = ctrl.Rule(experience_level["low"] & residence_level["low"], salary["low"])

rule4 = ctrl.Rule(experience_level["average"] & residence_level["high"], salary["low"])
rule5 = ctrl.Rule(experience_level["average"] & residence_level["average"], salary["average"])
rule6 = ctrl.Rule(experience_level["average"] & residence_level["low"], salary["high"])

rule7 = ctrl.Rule(experience_level["high"] & residence_level["high"], salary["low"])
rule8 = ctrl.Rule(experience_level["high"] & residence_level["average"], salary["high"])
rule9 = ctrl.Rule(experience_level["high"] & residence_level["low"], salary["higher"])

rule1.view()

fuzzy_rules = [
    rule1,
    rule2,
    rule3,
    rule4,
    rule5,
    rule6,
    rule7,
    rule8,
    rule9
]

salary_ctrl = ctrl.ControlSystem(fuzzy_rules)

salary_sim = ctrl.ControlSystemSimulation(salary_ctrl)

fuzzy_rules
Out[248]:
[IF exp[low] AND res[high] THEN salary[lower]
 	AND aggregation function : fmin
 	OR aggregation function  : fmax,
 IF exp[low] AND res[average] THEN salary[low]
 	AND aggregation function : fmin
 	OR aggregation function  : fmax,
 IF exp[low] AND res[low] THEN salary[low]
 	AND aggregation function : fmin
 	OR aggregation function  : fmax,
 IF exp[average] AND res[high] THEN salary[low]
 	AND aggregation function : fmin
 	OR aggregation function  : fmax,
 IF exp[average] AND res[average] THEN salary[average]
 	AND aggregation function : fmin
 	OR aggregation function  : fmax,
 IF exp[average] AND res[low] THEN salary[high]
 	AND aggregation function : fmin
 	OR aggregation function  : fmax,
 IF exp[high] AND res[high] THEN salary[low]
 	AND aggregation function : fmin
 	OR aggregation function  : fmax,
 IF exp[high] AND res[average] THEN salary[high]
 	AND aggregation function : fmin
 	OR aggregation function  : fmax,
 IF exp[high] AND res[low] THEN salary[higher]
 	AND aggregation function : fmin
 	OR aggregation function  : fmax]
No description has been provided for this image

Проверка работы. Работате корректно

In [249]:
salary_sim.input["exp"] = 1
salary_sim.input["res"] = 1
salary_sim.compute()
salary_sim.print_state()
display(salary_sim.output["salary"])
=============
 Antecedents 
=============
Antecedent: exp                     = 1
  - low                             : 0.3333333333333333
  - average                         : 0.6666666666666666
  - high                            : 0.0
Antecedent: res                     = 1
  - low                             : 0.9583333333333334
  - average                         : 0.041666666666666664
  - high                            : 0.0

=======
 Rules 
=======
RULE #0:
  IF exp[low] AND res[high] THEN salary[lower]
	AND aggregation function : fmin
	OR aggregation function  : fmax

  Aggregation (IF-clause):
  - exp[low]                                               : 0.3333333333333333
  - res[high]                                              : 0.0
                                    exp[low] AND res[high] = 0.0
  Activation (THEN-clause):
                                             salary[lower] : 0.0

RULE #1:
  IF exp[low] AND res[average] THEN salary[low]
	AND aggregation function : fmin
	OR aggregation function  : fmax

  Aggregation (IF-clause):
  - exp[low]                                               : 0.3333333333333333
  - res[average]                                           : 0.041666666666666664
                                 exp[low] AND res[average] = 0.041666666666666664
  Activation (THEN-clause):
                                               salary[low] : 0.041666666666666664

RULE #2:
  IF exp[low] AND res[low] THEN salary[low]
	AND aggregation function : fmin
	OR aggregation function  : fmax

  Aggregation (IF-clause):
  - exp[low]                                               : 0.3333333333333333
  - res[low]                                               : 0.9583333333333334
                                     exp[low] AND res[low] = 0.3333333333333333
  Activation (THEN-clause):
                                               salary[low] : 0.3333333333333333

RULE #3:
  IF exp[average] AND res[high] THEN salary[low]
	AND aggregation function : fmin
	OR aggregation function  : fmax

  Aggregation (IF-clause):
  - exp[average]                                           : 0.6666666666666666
  - res[high]                                              : 0.0
                                exp[average] AND res[high] = 0.0
  Activation (THEN-clause):
                                               salary[low] : 0.0

RULE #4:
  IF exp[average] AND res[average] THEN salary[average]
	AND aggregation function : fmin
	OR aggregation function  : fmax

  Aggregation (IF-clause):
  - exp[average]                                           : 0.6666666666666666
  - res[average]                                           : 0.041666666666666664
                             exp[average] AND res[average] = 0.041666666666666664
  Activation (THEN-clause):
                                           salary[average] : 0.041666666666666664

RULE #5:
  IF exp[average] AND res[low] THEN salary[high]
	AND aggregation function : fmin
	OR aggregation function  : fmax

  Aggregation (IF-clause):
  - exp[average]                                           : 0.6666666666666666
  - res[low]                                               : 0.9583333333333334
                                 exp[average] AND res[low] = 0.6666666666666666
  Activation (THEN-clause):
                                              salary[high] : 0.6666666666666666

RULE #6:
  IF exp[high] AND res[high] THEN salary[low]
	AND aggregation function : fmin
	OR aggregation function  : fmax

  Aggregation (IF-clause):
  - exp[high]                                              : 0.0
  - res[high]                                              : 0.0
                                   exp[high] AND res[high] = 0.0
  Activation (THEN-clause):
                                               salary[low] : 0.0

RULE #7:
  IF exp[high] AND res[average] THEN salary[high]
	AND aggregation function : fmin
	OR aggregation function  : fmax

  Aggregation (IF-clause):
  - exp[high]                                              : 0.0
  - res[average]                                           : 0.041666666666666664
                                exp[high] AND res[average] = 0.0
  Activation (THEN-clause):
                                              salary[high] : 0.0

RULE #8:
  IF exp[high] AND res[low] THEN salary[higher]
	AND aggregation function : fmin
	OR aggregation function  : fmax

  Aggregation (IF-clause):
  - exp[high]                                              : 0.0
  - res[low]                                               : 0.9583333333333334
                                    exp[high] AND res[low] = 0.0
  Activation (THEN-clause):
                                            salary[higher] : 0.0


==============================
 Intermediaries and Conquests 
==============================
Consequent: salary                   = 475633.7289605768
  lower:
    Accumulate using accumulation_max : 0.0
  low:
    Accumulate using accumulation_max : 0.3333333333333333
  average:
    Accumulate using accumulation_max : 0.041666666666666664
  high:
    Accumulate using accumulation_max : 0.6666666666666666
  higher:
    Accumulate using accumulation_max : 0.0

np.float64(475633.7289605768)
In [250]:
salary.view(sim=salary_sim)
No description has been provided for this image

Функция для вычисления целевой переменной

In [251]:
def fuzze_pred(row):
    salary_sim.input["exp"] = row["experience_level_encoded"]
    salary_sim.input["res"] = row["employee_residence_encoded"]
    salary_sim.compute()
    return salary_sim.output["salary"]

Тестирование на обучающей выборки

In [252]:
result_data = train_test_data.copy()
result_data["salary_pred"] = result_data.apply(fuzze_pred, axis=1).round()

result_data[["salary", "salary_pred"]].head(20)
Out[252]:
<style scoped=""> .dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </style>
salary salary_pred
0 35000 663119.0
1 151800 551730.0
2 100000 667726.0
3 156600 667726.0
4 175000 667726.0
5 220000 667726.0
6 104650 667726.0
7 42000 572764.0
8 90000 551730.0
9 140000 667726.0
10 100000 216000.0
11 106250 551730.0
12 215000 667726.0
13 135000 551730.0
14 164996 667726.0
15 216200 551730.0
16 100000 216000.0
17 350000 572764.0
18 60400 551730.0
19 125000 667726.0

Тестирование на тестовой выборки

In [ ]:
result_test_data = test_data_save.copy()
result_test_data["salary_pred"] = result_test_data.apply(fuzze_pred, axis=1).round()

result_test_data[["experience_level_encoded", "employee_residence_encoded","salary", "salary_pred"]].head(10)
Out[ ]:
<style scoped=""> .dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </style>
experience_level_encoded employee_residence_encoded salary salary_pred
0 3 12 35000 663119.0
1 2 11 151800 551730.0
2 3 11 100000 667726.0
3 3 11 156600 667726.0
4 3 11 175000 667726.0
... ... ... ... ...
95 3 48 35000 216000.0
96 3 12 45000 663119.0
97 3 11 70000 667726.0
98 3 11 75000 667726.0
99 3 11 161000 667726.0

100 rows × 4 columns

Анализ получившегося на основе метрик

In [254]:
import math
from sklearn import metrics


rmetrics = {}
rmetrics["RMSE_train"] = math.sqrt(
    metrics.mean_squared_error(result_data["salary"], result_data["salary_pred"])
)
rmetrics["RMSE_test"] = math.sqrt(
    metrics.mean_squared_error(result_test_data["salary"], result_test_data["salary_pred"])
)
rmetrics["RMAE_test"] = math.sqrt(
    metrics.mean_absolute_error(result_test_data["salary"], result_test_data["salary_pred"])
)
rmetrics["R2_test"] = metrics.r2_score(
    result_test_data["salary"], result_test_data["salary_pred"]
)

rmetrics
Out[254]:
{'RMSE_train': 850323.5180232156,
 'RMSE_test': 850323.5180232156,
 'RMAE_test': 705.3551331408898,
 'R2_test': -0.3826308706791317}

Модель получилась просто ужасной, RMSE слишком большой, R^2 отрицательный. Очень не понятный дата сет, я не смог найти какие-то зависимости и, соответсвенно, не смог корректно настроить модель.