Files
Владимир Данилов 1f9f304e67 Лабораторная работа №7
2025-03-21 15:03:21 +04:00

196 KiB
Raw Permalink Blame History

Начало лабораторной

Выгрузка данных из csv файла в датафрейм

Импорт библиотек

In [102]:
import pandas as pd
import numpy as np
import skfuzzy as fuzz
import matplotlib.pyplot as plt
from skfuzzy import control as ctrl
from sklearn import metrics
import math

Загрузка данных

In [94]:
file_path = "./static/csv/ds_salaries.csv"
df = pd.read_csv(file_path)

print(df.columns)
df
Index(['work_year', 'experience_level', 'employment_type', 'job_title',
       'salary', 'salary_currency', 'salary_in_usd', 'employee_residence',
       'remote_ratio', 'company_location', 'company_size'],
      dtype='object')
Out[94]:
<style scoped=""> .dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </style>
work_year experience_level employment_type job_title salary salary_currency salary_in_usd employee_residence remote_ratio company_location company_size
0 2023 SE FT Principal Data Scientist 80000 EUR 85847 ES 100 ES L
1 2023 MI CT ML Engineer 30000 USD 30000 US 100 US S
2 2023 MI CT ML Engineer 25500 USD 25500 US 100 US S
3 2023 SE FT Data Scientist 175000 USD 175000 CA 100 CA M
4 2023 SE FT Data Scientist 120000 USD 120000 CA 100 CA M
... ... ... ... ... ... ... ... ... ... ... ...
3750 2020 SE FT Data Scientist 412000 USD 412000 US 100 US L
3751 2021 MI FT Principal Data Scientist 151000 USD 151000 US 100 US L
3752 2020 EN FT Data Scientist 105000 USD 105000 US 100 US S
3753 2020 EN CT Business Data Analyst 100000 USD 100000 US 100 US L
3754 2021 SE FT Data Science Manager 7000000 INR 94665 IN 50 IN L

3755 rows × 11 columns

Определение лингвистических переменных

In [95]:
experience_mapping = {"EN": 1, "MI": 2, "SE": 3, "EX": 4}
df["experience_level"] = df["experience_level"].map(experience_mapping)

experience = ctrl.Antecedent(np.arange(df["experience_level"].min(), df["experience_level"].max() + 1, 1), "experience")
remote_ratio = ctrl.Antecedent(np.arange(df["remote_ratio"].min(), df["remote_ratio"].max() + 1, 1), "remote_ratio")
salary = ctrl.Consequent(np.arange(df["salary_in_usd"].min(), df["salary_in_usd"].max() + 1, 1), "salary")

Настройка лингвистических переменных

In [96]:
experience["junior"] = fuzz.zmf(experience.universe, df["experience_level"].min(), df["experience_level"].quantile(0.33))
experience["mid"] = fuzz.trapmf(experience.universe, [df["experience_level"].min(), df["experience_level"].quantile(0.33), df["experience_level"].quantile(0.66), df["experience_level"].max()])
experience["senior"] = fuzz.smf(experience.universe, df["experience_level"].quantile(0.66), df["experience_level"].max())
experience.view()

remote_ratio["low"] = fuzz.zmf(remote_ratio.universe, df["remote_ratio"].min(), df["remote_ratio"].quantile(0.33))
remote_ratio["medium"] = fuzz.trapmf(remote_ratio.universe, [df["remote_ratio"].min(), df["remote_ratio"].quantile(0.33), df["remote_ratio"].quantile(0.66), df["remote_ratio"].max()])
remote_ratio["high"] = fuzz.smf(remote_ratio.universe, df["remote_ratio"].quantile(0.66), df["remote_ratio"].max())
remote_ratio.view()

salary["low"] = fuzz.zmf(salary.universe, df["salary_in_usd"].min(), df["salary_in_usd"].quantile(0.33))
salary["medium"] = fuzz.trapmf(salary.universe, [df["salary_in_usd"].min(), df["salary_in_usd"].quantile(0.33), df["salary_in_usd"].quantile(0.66), df["salary_in_usd"].max()])
salary["high"] = fuzz.smf(salary.universe, df["salary_in_usd"].quantile(0.66), df["salary_in_usd"].max())
salary.view()
c:\Users\danil\AppData\Local\Programs\Python\Python312\Lib\site-packages\skfuzzy\control\fuzzyvariable.py:125: UserWarning: FigureCanvasAgg is non-interactive, and thus cannot be shown
  fig.show()
c:\Users\danil\AppData\Local\Programs\Python\Python312\Lib\site-packages\skfuzzy\membership\generatemf.py:569: RuntimeWarning: invalid value encountered in divide
  y[idx] = 2. * ((x[idx] - b) / (b - a)) ** 2.
c:\Users\danil\AppData\Local\Programs\Python\Python312\Lib\site-packages\skfuzzy\membership\generatemf.py:456: RuntimeWarning: invalid value encountered in divide
  y[idx] = 2. * ((x[idx] - a) / (b - a)) ** 2.
c:\Users\danil\AppData\Local\Programs\Python\Python312\Lib\site-packages\skfuzzy\membership\generatemf.py:459: RuntimeWarning: invalid value encountered in divide
  y[idx] = 1 - 2. * ((x[idx] - b) / (b - a)) ** 2.
c:\Users\danil\AppData\Local\Programs\Python\Python312\Lib\site-packages\skfuzzy\control\fuzzyvariable.py:125: UserWarning: FigureCanvasAgg is non-interactive, and thus cannot be shown
  fig.show()
c:\Users\danil\AppData\Local\Programs\Python\Python312\Lib\site-packages\skfuzzy\control\fuzzyvariable.py:125: UserWarning: FigureCanvasAgg is non-interactive, and thus cannot be shown
  fig.show()
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image

Создание базы нечетких правил

In [109]:
rule1 = ctrl.Rule(experience["junior"] & remote_ratio["low"], salary["low"])
rule2 = ctrl.Rule(experience["junior"] & remote_ratio["medium"], salary["medium"])
rule3 = ctrl.Rule(experience["junior"] & remote_ratio["high"], salary["medium"])

rule4 = ctrl.Rule(experience["mid"] & remote_ratio["low"], salary["medium"])
rule5 = ctrl.Rule(experience["mid"] & remote_ratio["medium"], salary["medium"])
rule6 = ctrl.Rule(experience["mid"] & remote_ratio["high"], salary["high"])

rule7 = ctrl.Rule(experience["senior"] & remote_ratio["low"], salary["low"])
rule8 = ctrl.Rule(experience["senior"] & remote_ratio["medium"], salary["medium"])
rule9 = ctrl.Rule(experience["senior"] & remote_ratio["high"], salary["high"])

#rule1.view()
print(rule1)
IF experience[junior] AND remote_ratio[low] THEN salary[low]
	AND aggregation function : fmin
	OR aggregation function  : fmax

Создание нечеткой системы

In [108]:
salary_ctrl = ctrl.ControlSystem([
    rule1,
    rule2,
    rule3,
    rule4,
    rule5,
    rule6,
    rule7,
    rule8,
    rule9,
])

salary_simulation = ctrl.ControlSystemSimulation(salary_ctrl)

for rule in salary_ctrl.rules:
    print(rule)
#salary_ctrl.view()
IF experience[junior] AND remote_ratio[low] THEN salary[low]
	AND aggregation function : fmin
	OR aggregation function  : fmax
IF experience[junior] AND remote_ratio[medium] THEN salary[medium]
	AND aggregation function : fmin
	OR aggregation function  : fmax
IF experience[junior] AND remote_ratio[high] THEN salary[medium]
	AND aggregation function : fmin
	OR aggregation function  : fmax
IF experience[mid] AND remote_ratio[low] THEN salary[medium]
	AND aggregation function : fmin
	OR aggregation function  : fmax
IF experience[mid] AND remote_ratio[medium] THEN salary[medium]
	AND aggregation function : fmin
	OR aggregation function  : fmax
IF experience[mid] AND remote_ratio[high] THEN salary[high]
	AND aggregation function : fmin
	OR aggregation function  : fmax
IF experience[senior] AND remote_ratio[low] THEN salary[low]
	AND aggregation function : fmin
	OR aggregation function  : fmax
IF experience[senior] AND remote_ratio[medium] THEN salary[medium]
	AND aggregation function : fmin
	OR aggregation function  : fmax
IF experience[senior] AND remote_ratio[high] THEN salary[high]
	AND aggregation function : fmin
	OR aggregation function  : fmax

Проверка расчета выходной переменной

In [101]:
salary_simulation.input["experience"] = 3
salary_simulation.input["remote_ratio"] = 50
salary_simulation.compute()

salary_simulation.print_state()
print(salary_simulation.output["salary"])

salary.view(sim=salary_simulation)
=============
 Antecedents 
=============
Antecedent: experience              = 3
  - junior                          : 0.0
  - mid                             : 1.0
  - senior                          : 0.0
Antecedent: remote_ratio            = 50
  - low                             : 0.0
  - medium                          : 1.0
  - high                            : 0.0

=======
 Rules 
=======
RULE #0:
  IF experience[junior] AND remote_ratio[low] THEN salary[low]
	AND aggregation function : fmin
	OR aggregation function  : fmax

  Aggregation (IF-clause):
  - experience[junior]                                     : 0.0
  - remote_ratio[low]                                      : 0.0
                  experience[junior] AND remote_ratio[low] = 0.0
  Activation (THEN-clause):
                                               salary[low] : 0.0

RULE #1:
  IF experience[junior] AND remote_ratio[medium] THEN salary[medium]
	AND aggregation function : fmin
	OR aggregation function  : fmax

  Aggregation (IF-clause):
  - experience[junior]                                     : 0.0
  - remote_ratio[medium]                                   : 1.0
               experience[junior] AND remote_ratio[medium] = 0.0
  Activation (THEN-clause):
                                            salary[medium] : 0.0

RULE #2:
  IF experience[junior] AND remote_ratio[high] THEN salary[medium]
	AND aggregation function : fmin
	OR aggregation function  : fmax

  Aggregation (IF-clause):
  - experience[junior]                                     : 0.0
  - remote_ratio[high]                                     : 0.0
                 experience[junior] AND remote_ratio[high] = 0.0
  Activation (THEN-clause):
                                            salary[medium] : 0.0

RULE #3:
  IF experience[mid] AND remote_ratio[low] THEN salary[medium]
	AND aggregation function : fmin
	OR aggregation function  : fmax

  Aggregation (IF-clause):
  - experience[mid]                                        : 1.0
  - remote_ratio[low]                                      : 0.0
                     experience[mid] AND remote_ratio[low] = 0.0
  Activation (THEN-clause):
                                            salary[medium] : 0.0

RULE #4:
  IF experience[mid] AND remote_ratio[medium] THEN salary[medium]
	AND aggregation function : fmin
	OR aggregation function  : fmax

  Aggregation (IF-clause):
  - experience[mid]                                        : 1.0
  - remote_ratio[medium]                                   : 1.0
                  experience[mid] AND remote_ratio[medium] = 1.0
  Activation (THEN-clause):
                                            salary[medium] : 1.0

RULE #5:
  IF experience[mid] AND remote_ratio[high] THEN salary[high]
	AND aggregation function : fmin
	OR aggregation function  : fmax

  Aggregation (IF-clause):
  - experience[mid]                                        : 1.0
  - remote_ratio[high]                                     : 0.0
                    experience[mid] AND remote_ratio[high] = 0.0
  Activation (THEN-clause):
                                              salary[high] : 0.0

RULE #6:
  IF experience[senior] AND remote_ratio[low] THEN salary[low]
	AND aggregation function : fmin
	OR aggregation function  : fmax

  Aggregation (IF-clause):
  - experience[senior]                                     : 0.0
  - remote_ratio[low]                                      : 0.0
                  experience[senior] AND remote_ratio[low] = 0.0
  Activation (THEN-clause):
                                               salary[low] : 0.0

RULE #7:
  IF experience[senior] AND remote_ratio[medium] THEN salary[medium]
	AND aggregation function : fmin
	OR aggregation function  : fmax

  Aggregation (IF-clause):
  - experience[senior]                                     : 0.0
  - remote_ratio[medium]                                   : 1.0
               experience[senior] AND remote_ratio[medium] = 0.0
  Activation (THEN-clause):
                                            salary[medium] : 0.0

RULE #8:
  IF experience[senior] AND remote_ratio[high] THEN salary[high]
	AND aggregation function : fmin
	OR aggregation function  : fmax

  Aggregation (IF-clause):
  - experience[senior]                                     : 0.0
  - remote_ratio[high]                                     : 0.0
                 experience[senior] AND remote_ratio[high] = 0.0
  Activation (THEN-clause):
                                              salary[high] : 0.0


==============================
 Intermediaries and Conquests 
==============================
Consequent: salary                   = 193314.1574714936
  low:
    Accumulate using accumulation_max : 0.0
  medium:
    Accumulate using accumulation_max : 1.0
  high:
    Accumulate using accumulation_max : 0.0

193314.1574714936
c:\Users\danil\AppData\Local\Programs\Python\Python312\Lib\site-packages\skfuzzy\control\fuzzyvariable.py:125: UserWarning: FigureCanvasAgg is non-interactive, and thus cannot be shown
  fig.show()
C:\Users\danil\AppData\Roaming\Python\Python312\site-packages\IPython\core\events.py:82: UserWarning: Creating legend with loc="best" can be slow with large amounts of data.
  func(*args, **kwargs)
C:\Users\danil\AppData\Roaming\Python\Python312\site-packages\IPython\core\pylabtools.py:170: UserWarning: Creating legend with loc="best" can be slow with large amounts of data.
  fig.canvas.print_figure(bytes_io, **kw)
No description has been provided for this image

Оценка качества предсказаний

In [103]:
def fuzzy_pred(row):
    salary_simulation.input["experience"] = row["experience_level"]
    salary_simulation.input["remote_ratio"] = row["remote_ratio"]
    salary_simulation.compute()
    return salary_simulation.output["salary"]

result = df.copy()
result["salary_pred"] = result.apply(fuzzy_pred, axis=1)

print(result.loc[115:130, ["experience_level", "remote_ratio", "salary_in_usd", "salary_pred"]])

rmetrics = {}
rmetrics["RMSE"] = math.sqrt(metrics.mean_squared_error(result["salary_in_usd"], result["salary_pred"]))
rmetrics["RMAE"] = math.sqrt(metrics.mean_absolute_error(result["salary_in_usd"], result["salary_pred"]))
rmetrics["R2"] = metrics.r2_score(result["salary_in_usd"], result["salary_pred"])
print(rmetrics)
     experience_level  remote_ratio  salary_in_usd    salary_pred
115                 3             0         150000  193314.157471
116                 3             0         120000  193314.157471
117                 3             0         289800  193314.157471
118                 3             0         214000  193314.157471
119                 3             0         179820  193314.157471
120                 3             0         143860  193314.157471
121                 3           100         283200  245596.723043
122                 3           100         188800  245596.723043
123                 3             0         289800  193314.157471
124                 3             0         214200  193314.157471
125                 3             0         185900  193314.157471
126                 3             0         129300  193314.157471
127                 3             0         252000  193314.157471
128                 3             0         129000  193314.157471
129                 2             0         155000  206449.195020
130                 2             0         140000  206449.195020
{'RMSE': 102064.14202964267, 'RMAE': 296.77197859645963, 'R2': -1.620682315659578}

Вывод: данные прогнозируются с низкой точностью и допускается множество ошибок