AIM-PIbd-32-Kaznacheeva-E-K/Lab4.ipynb at e64d5b2980098ce7cddcc6d78b4ccc5e34d57c4b

2024-11-23 12:17:48 +04:00

201 KiB

Raw Blame History

Начало лабораторной работы

In [8]:

import pandas as pd
df = pd.read_csv("..//static//csv//ds_salaries.csv")
print(df.columns)

Index(['work_year', 'experience_level', 'employment_type', 'job_title',
       'salary', 'salary_currency', 'salary_in_usd', 'employee_residence',
       'remote_ratio', 'company_location', 'company_size'],
      dtype='object')

In [10]:

df.head()

Out[10]:

	work_year	experience_level	employment_type	job_title	salary	salary_currency	salary_in_usd	employee_residence	remote_ratio	company_location	company_size
0	2023	SE	FT	Principal Data Scientist	80000	EUR	85847	ES	100	ES	L
1	2023	MI	CT	ML Engineer	30000	USD	30000	US	100	US	S
2	2023	MI	CT	ML Engineer	25500	USD	25500	US	100	US	S
3	2023	SE	FT	Data Scientist	175000	USD	175000	CA	100	CA	M
4	2023	SE	FT	Data Scientist	120000	USD	120000	CA	100	CA	M

In [9]:

df.describe()

Out[9]:

	work_year	salary	salary_in_usd	remote_ratio
count	3755.000000	3.755000e+03	3755.000000	3755.000000
mean	2022.373635	1.906956e+05	137570.389880	46.271638
std	0.691448	6.716765e+05	63055.625278	48.589050
min	2020.000000	6.000000e+03	5132.000000	0.000000
25%	2022.000000	1.000000e+05	95000.000000	0.000000
50%	2022.000000	1.380000e+05	135000.000000	0.000000
75%	2023.000000	1.800000e+05	175000.000000	100.000000
max	2023.000000	3.040000e+07	450000.000000	100.000000

In [11]:

# Процент пропущенных значений признаков
for i in df.columns:
    null_rate = df[i].isnull().sum() / len(df) * 100
    if null_rate > 0:
        print(f'{i} Процент пустых значений: %{null_rate:.2f}')

print(df.isnull().sum())

print(df.isnull().any())

work_year             0
experience_level      0
employment_type       0
job_title             0
salary                0
salary_currency       0
salary_in_usd         0
employee_residence    0
remote_ratio          0
company_location      0
company_size          0
dtype: int64
work_year             False
experience_level      False
employment_type       False
job_title             False
salary                False
salary_currency       False
salary_in_usd         False
employee_residence    False
remote_ratio          False
company_location      False
company_size          False
dtype: bool

In [13]:

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from typing import Tuple
import pandas as pd
from pandas import DataFrame
from sklearn.model_selection import train_test_split

# Загрузка данных
df = pd.read_csv("..//static//csv//ds_salaries.csv")

# Создание целевого признака
median_salary = df['salary_in_usd'].median()
df['above_median_salary'] = np.where(df['salary_in_usd'] > median_salary, 1, 0)

# Разделение на признаки и целевую переменную
X = df.drop(columns=['salary_in_usd', 'above_median_salary'])
y = df['above_median_salary']

# Примерная категоризация
df['salary_category'] = pd.cut(df['salary_in_usd'], bins=[0, 100000, 200000, np.inf], labels=[0, 1, 2])

# Выбор признаков и целевых переменных
X = df.drop(columns=['salary_in_usd', 'salary_category'])

def split_stratified_into_train_val_test(
    df_input,
    stratify_colname="y",
    frac_train=0.6,
    frac_val=0.15,
    frac_test=0.25,
    random_state=None,
) -> Tuple[DataFrame, DataFrame, DataFrame, DataFrame, DataFrame, DataFrame]:
   
    if frac_train + frac_val + frac_test != 1.0:
        raise ValueError(
            "fractions %f, %f, %f do not add up to 1.0"
            % (frac_train, frac_val, frac_test)
        )
    
    if stratify_colname not in df_input.columns:
        raise ValueError("%s is not a column in the dataframe" % (stratify_colname))
    X = df_input  # Contains all columns.
    y = df_input[
        [stratify_colname]
    ]  # Dataframe of just the column on which to stratify.
    
    # Split original dataframe into train and temp dataframes.
    df_train, df_temp, y_train, y_temp = train_test_split(
        X, y, stratify=y, test_size=(1.0 - frac_train), random_state=random_state
    )

    if frac_val <= 0:
        assert len(df_input) == len(df_train) + len(df_temp)
        return df_train, pd.DataFrame(), df_temp, y_train, pd.DataFrame(), y_temp
    # Split the temp dataframe into val and test dataframes.
    relative_frac_test = frac_test / (frac_val + frac_test)

    df_val, df_test, y_val, y_test = train_test_split(
        df_temp,
        y_temp,
        stratify=y_temp,
        test_size=relative_frac_test,
        random_state=random_state,
    )

    assert len(df_input) == len(df_train) + len(df_val) + len(df_test)
    return df_train, df_val, df_test, y_train, y_val, y_test

X_train, X_val, X_test, y_train, y_val, y_test = split_stratified_into_train_val_test(
    df, stratify_colname="above_median_salary", frac_train=0.80, frac_val=0, frac_test=0.20, random_state=42
)

display("X_train", X_train)
display("y_train", y_train)

display("X_test", X_test)
display("y_test", y_test)

# Проверка преобразования
print(df.dtypes)

# Визуализация распределения зарплат
plt.figure(figsize=(10, 6))
sns.histplot(df['salary_in_usd'], bins=50, kde=True)
plt.title('Распределение зарплат')
plt.xlabel('Зарплата (USD)')
plt.ylabel('Частота')
plt.show()

# Визуализация зависимости между зарплатой и уровнем опыта
plt.figure(figsize=(10, 6))
sns.boxplot(x='experience_level', y='salary_in_usd', data=df)
plt.title('Зависимость зарплаты от уровня опыта')
plt.xlabel('Уровень опыта')
plt.ylabel('Зарплата (USD)')
plt.show()

'X_train'

	work_year	experience_level	employment_type	job_title	salary	salary_currency	salary_in_usd	employee_residence	remote_ratio	company_location	company_size	above_median_salary	salary_category
1809	2023	SE	FT	Data Engineer	182000	USD	182000	US	100	US	M	1	1
1082	2023	SE	FT	Machine Learning Engineer	126000	USD	126000	US	0	US	M	0	1
1686	2023	SE	FT	BI Developer	140000	USD	140000	US	100	US	M	1	1
1600	2023	SE	FT	Data Scientist	140000	USD	140000	US	0	US	M	1	1
1376	2023	SE	FT	Data Engineer	226700	USD	226700	US	0	US	M	1	2
...	...	...	...	...	...	...	...	...	...	...	...	...	...
2706	2022	SE	FT	Data Engineer	160000	USD	160000	US	100	US	M	1	1
928	2023	MI	FT	Data Engineer	200000	USD	200000	US	0	US	M	1	1
564	2023	MI	FT	Data Engineer	140000	USD	140000	US	0	US	M	1	1
716	2023	SE	FT	Data Scientist	297300	USD	297300	US	100	US	M	1	2
1299	2023	SE	FT	Data Engineer	133832	USD	133832	US	0	US	M	0	1

3004 rows × 13 columns

'y_train'

	above_median_salary
1809	1
1082	0
1686	1
1600	1
1376	1
...	...
2706	1
928	1
564	1
716	1
1299	0

3004 rows × 1 columns

'X_test'

	work_year	experience_level	employment_type	job_title	salary	salary_currency	salary_in_usd	employee_residence	remote_ratio	company_location	company_size	above_median_salary	salary_category
3459	2022	MI	FT	Research Scientist	59000	EUR	61989	AT	0	AT	L	0	0
3724	2021	EN	FT	Business Data Analyst	50000	EUR	59102	LU	100	LU	L	0	0
1795	2023	SE	FT	Data Engineer	180000	USD	180000	US	0	US	M	1	1
3535	2021	MI	FT	Data Scientist	50000	USD	50000	NG	100	NG	L	0	0
3255	2022	MI	FT	Data Analyst	106260	USD	106260	US	0	US	M	0	1
...	...	...	...	...	...	...	...	...	...	...	...	...	...
1943	2022	MI	FT	Data Engineer	120000	USD	120000	US	100	US	M	0	1
573	2023	EN	FT	Autonomous Vehicle Technician	7000	USD	7000	GH	0	GH	S	0	0
3013	2022	SE	FT	Machine Learning Engineer	129300	USD	129300	US	0	US	M	0	1
327	2023	EN	FT	Data Scientist	70000	CAD	51753	CA	100	CA	L	0	0
1565	2023	SE	FT	Data Analyst	48000	EUR	51508	ES	0	ES	M	0	0

751 rows × 13 columns

'y_test'

	above_median_salary
3459	0
3724	0
1795	1
3535	0
3255	0
...	...
1943	0
573	0
3013	0
327	0
1565	0

751 rows × 1 columns

work_year                 int64
experience_level         object
employment_type          object
job_title                object
salary                    int64
salary_currency          object
salary_in_usd             int64
employee_residence       object
remote_ratio              int64
company_location         object
company_size             object
above_median_salary       int64
salary_category        category
dtype: object

No description has been provided for this image

In [ ]:

import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split

# Загрузка данных
df = pd.read_csv("..//static//csv//ds_salaries.csv")

# Создание целевого признака
median_salary = df['salary_in_usd'].median()
df['above_median_salary'] = np.where(df['salary_in_usd'] > median_salary, 1, 0)

# Разделение на признаки и целевую переменную
X = df.drop(columns=['salary_in_usd', 'above_median_salary'])
y = df['above_median_salary']

# Разделение данных на тренировочный и тестовый наборы
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Построение конвейеров предобработки

class SalaryFeatures(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        # Создание новых признаков
        X = X.copy()
        X["work_year_to_remote_ratio"] = X["work_year"] / X["remote_ratio"]
        return X
    def get_feature_names_out(self, features_in):
        # Добавление имен новых признаков
        new_features = ["work_year_to_remote_ratio"]
        return np.append(features_in, new_features, axis=0)

# Обработка числовых данных. Числовой конвейер: заполнение пропущенных значений медианой и стандартизация
preprocessing_num_class = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Обработка категориальных данных: заполнение пропущенных значений наиболее частым значением и one-hot encoding
preprocessing_cat_class = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Определение столбцов
numeric_columns = ["work_year", "salary", "salary_in_usd", "remote_ratio"]
cat_columns = ["experience_level", "employment_type", "job_title", "salary_currency", "employee_residence", "company_location", "company_size"]

# Предобработка признаков
features_preprocessing = ColumnTransformer(
    verbose_feature_names_out=False,
    transformers=[
        ("prepocessing_num", preprocessing_num_class, numeric_columns),
        ("prepocessing_cat", preprocessing_cat_class, cat_columns),
    ],
    remainder="passthrough"
)

# Удаление колонок
columns_to_drop = []  # Укажите столбцы, которые нужно удалить, если они есть
drop_columns = ColumnTransformer(
    verbose_feature_names_out=False,
    transformers=[
        ("drop_columns", "drop", columns_to_drop),
    ],
    remainder="passthrough",
)

# Основной конвейер предобработки данных и конструирования признаков
pipeline_end = Pipeline(
    [
        ("features_preprocessing", features_preprocessing),
        ("custom_features", SalaryFeatures()),
        ("drop_columns", drop_columns),
    ]
)

# Демонстрация работы конвейера для предобработки данных при классификации
preprocessing_result = pipeline_end.fit_transform(X_train)

# Получение имен столбцов после преобразования
feature_names = pipeline_end.named_steps['features_preprocessing'].get_feature_names_out(numeric_columns + cat_columns)
feature_names = np.append(feature_names, ["work_year_to_remote_ratio"])

# Создание DataFrame с преобразованными данными
preprocessed_df = pd.DataFrame(
    preprocessing_result,
    columns=feature_names,
)

preprocessed_df

---------------------------------------------------------------------------
IndexError                                Traceback (most recent call last)
Cell In[14], line 71
     62 pipeline_end = Pipeline(
     63     [
     64         ("features_preprocessing", features_preprocessing),
   (...)
     67     ]
     68 )
     70 # Демонстрация работы конвейера для предобработки данных при классификации
---> 71 preprocessing_result = pipeline_end.fit_transform(X_train)
     72 preprocessed_df = pd.DataFrame(
     73     preprocessing_result,
     74     columns=pipeline_end.get_feature_names_out(),
     75 )
     77 preprocessed_df

File d:\MII\AIM-PIbd-32-Kaznacheeva-E-K\aimenv\Lib\site-packages\sklearn\base.py:1473, in _fit_context.<locals>.decorator.<locals>.wrapper(estimator, *args, **kwargs)
   1466     estimator._validate_params()
   1468 with config_context(
   1469     skip_parameter_validation=(
   1470         prefer_skip_nested_validation or global_skip_validation
   1471     )
   1472 ):
-> 1473     return fit_method(estimator, *args, **kwargs)

File d:\MII\AIM-PIbd-32-Kaznacheeva-E-K\aimenv\Lib\site-packages\sklearn\pipeline.py:533, in Pipeline.fit_transform(self, X, y, **params)
    490 """Fit the model and transform with the final estimator.
    491 
    492 Fit all the transformers one after the other and sequentially transform
   (...)
    530     Transformed samples.
    531 """
    532 routed_params = self._check_method_params(method="fit_transform", props=params)
--> 533 Xt = self._fit(X, y, routed_params)
    535 last_step = self._final_estimator
    536 with _print_elapsed_time("Pipeline", self._log_message(len(self.steps) - 1)):

File d:\MII\AIM-PIbd-32-Kaznacheeva-E-K\aimenv\Lib\site-packages\sklearn\pipeline.py:406, in Pipeline._fit(self, X, y, routed_params)
    404     cloned_transformer = clone(transformer)
    405 # Fit or load from cache the current transformer
--> 406 X, fitted_transformer = fit_transform_one_cached(
    407     cloned_transformer,
    408     X,
    409     y,
    410     None,
    411     message_clsname="Pipeline",
    412     message=self._log_message(step_idx),
    413     params=routed_params[name],
    414 )
    415 # Replace the transformer of the step with the fitted
    416 # transformer. This is necessary when loading the transformer
    417 # from the cache.
    418 self.steps[step_idx] = (name, fitted_transformer)

File d:\MII\AIM-PIbd-32-Kaznacheeva-E-K\aimenv\Lib\site-packages\joblib\memory.py:312, in NotMemorizedFunc.__call__(self, *args, **kwargs)
    311 def __call__(self, *args, **kwargs):
--> 312     return self.func(*args, **kwargs)

File d:\MII\AIM-PIbd-32-Kaznacheeva-E-K\aimenv\Lib\site-packages\sklearn\pipeline.py:1310, in _fit_transform_one(transformer, X, y, weight, message_clsname, message, params)
   1308 with _print_elapsed_time(message_clsname, message):
   1309     if hasattr(transformer, "fit_transform"):
-> 1310         res = transformer.fit_transform(X, y, **params.get("fit_transform", {}))
   1311     else:
   1312         res = transformer.fit(X, y, **params.get("fit", {})).transform(
   1313             X, **params.get("transform", {})
   1314         )

File d:\MII\AIM-PIbd-32-Kaznacheeva-E-K\aimenv\Lib\site-packages\sklearn\utils\_set_output.py:316, in _wrap_method_output.<locals>.wrapped(self, X, *args, **kwargs)
    314 @wraps(f)
    315 def wrapped(self, X, *args, **kwargs):
--> 316     data_to_wrap = f(self, X, *args, **kwargs)
    317     if isinstance(data_to_wrap, tuple):
    318         # only wrap the first output for cross decomposition
    319         return_tuple = (
    320             _wrap_data_with_container(method, data_to_wrap[0], X, self),
    321             *data_to_wrap[1:],
    322         )

File d:\MII\AIM-PIbd-32-Kaznacheeva-E-K\aimenv\Lib\site-packages\sklearn\base.py:1098, in TransformerMixin.fit_transform(self, X, y, **fit_params)
   1083         warnings.warn(
   1084             (
   1085                 f"This object ({self.__class__.__name__}) has a `transform`"
   (...)
   1093             UserWarning,
   1094         )
   1096 if y is None:
   1097     # fit method of arity 1 (unsupervised transformation)
-> 1098     return self.fit(X, **fit_params).transform(X)
   1099 else:
   1100     # fit method of arity 2 (supervised transformation)
   1101     return self.fit(X, y, **fit_params).transform(X)

File d:\MII\AIM-PIbd-32-Kaznacheeva-E-K\aimenv\Lib\site-packages\sklearn\utils\_set_output.py:316, in _wrap_method_output.<locals>.wrapped(self, X, *args, **kwargs)
    314 @wraps(f)
    315 def wrapped(self, X, *args, **kwargs):
--> 316     data_to_wrap = f(self, X, *args, **kwargs)
    317     if isinstance(data_to_wrap, tuple):
    318         # only wrap the first output for cross decomposition
    319         return_tuple = (
    320             _wrap_data_with_container(method, data_to_wrap[0], X, self),
    321             *data_to_wrap[1:],
    322         )

Cell In[14], line 18, in SalaryFeatures.transform(self, X, y)
     15 def transform(self, X, y=None):
     16     # Создание новых признаков
     17     X = X.copy()
---> 18     X["work_year_to_remote_ratio"] = X["work_year"] / X["remote_ratio"]
     19     return X

File d:\MII\AIM-PIbd-32-Kaznacheeva-E-K\aimenv\Lib\site-packages\scipy\sparse\_csr.py:24, in _csr_base.__getitem__(self, key)
     22 def __getitem__(self, key):
     23     if self.ndim == 2:
---> 24         return super().__getitem__(key)
     26     if isinstance(key, tuple) and len(key) == 1:
     27         key = key[0]

File d:\MII\AIM-PIbd-32-Kaznacheeva-E-K\aimenv\Lib\site-packages\scipy\sparse\_index.py:52, in IndexMixin.__getitem__(self, key)
     51 def __getitem__(self, key):
---> 52     row, col = self._validate_indices(key)
     54     # Dispatch to specialized methods.
     55     if isinstance(row, INT_TYPES):

File d:\MII\AIM-PIbd-32-Kaznacheeva-E-K\aimenv\Lib\site-packages\scipy\sparse\_index.py:186, in IndexMixin._validate_indices(self, key)
    184     row = _validate_bool_idx(bool_row, M, "row")
    185 elif not isinstance(row, slice):
--> 186     row = self._asindices(row, M)
    188 if isintlike(col):
    189     col = int(col)

File d:\MII\AIM-PIbd-32-Kaznacheeva-E-K\aimenv\Lib\site-packages\scipy\sparse\_index.py:212, in IndexMixin._asindices(self, idx, length)
    209     raise IndexError('invalid index') from e
    211 if x.ndim not in (1, 2):
--> 212     raise IndexError('Index dimension must be 1 or 2')
    214 if x.size == 0:
    215     return x

IndexError: Index dimension must be 1 or 2

Бизнес-цели

Предсказание заработной платы (Регрессия)

Цель: Предсказать заработную плату (salary_in_usd) на основе других характеристик, таких как уровень опыта (experience_level), тип занятости (employment_type), должность (job_title), место проживания сотрудника (employee_residence), размер компании (company_size) и другие факторы.

Применение: Это может быть полезно для HR-отделов, которые хотят оценить справедливую зарплату для новых сотрудников или для анализа рынка труда.
Классификация уровня опыта по зарплате (Классификация)

Цель: Классифицировать уровень опыта (experience_level) на основе заработной платы (salary_in_usd) и других факторов.

Применение: Это может помочь в оценке, на каком уровне опыта находится сотрудник, основываясь на его зарплате, что может быть полезно для оценки карьерного роста.

Прогнозирование зарплаты

In [14]:

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import seaborn as sns
import matplotlib.pyplot as plt

# Загружаем набор данных
df = pd.read_csv("..//static//csv//ds_salaries.csv")

# Устанавливаем случайное состояние
random_state = 42

# Предварительный анализ данных
print(df.head())
print(df.info())
print(df.describe())

# Проверка на пропущенные значения
print(df.isnull().sum())

# Предобработка данных
# Определяем категориальные и числовые столбцы
categorical_features = ['experience_level', 'employment_type', 'job_title', 'employee_residence', 'company_location', 'company_size']
numeric_features = ['work_year', 'remote_ratio']

# Создаем пайплайн для обработки данных
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)])

# Определяем целевую переменную и признаки
X = df.drop('salary_in_usd', axis=1)
y = df['salary_in_usd']

# Разделяем данные на обучающую и тестовую выборки
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=random_state)

# Создаем и обучаем модель
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())])

model.fit(X_train, y_train)

# Делаем предсказания на тестовой выборке
y_pred = model.predict(X_test)

# Оцениваем качество модели
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"R^2 Score: {r2}")

   work_year experience_level employment_type                 job_title  \
0       2023               SE              FT  Principal Data Scientist   
1       2023               MI              CT               ML Engineer   
2       2023               MI              CT               ML Engineer   
3       2023               SE              FT            Data Scientist   
4       2023               SE              FT            Data Scientist   

   salary salary_currency  salary_in_usd employee_residence  remote_ratio  \
0   80000             EUR          85847                 ES           100   
1   30000             USD          30000                 US           100   
2   25500             USD          25500                 US           100   
3  175000             USD         175000                 CA           100   
4  120000             USD         120000                 CA           100   

  company_location company_size  
0               ES            L  
1               US            S  
2               US            S  
3               CA            M  
4               CA            M  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3755 entries, 0 to 3754
Data columns (total 11 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   work_year           3755 non-null   int64 
 1   experience_level    3755 non-null   object
 2   employment_type     3755 non-null   object
 3   job_title           3755 non-null   object
 4   salary              3755 non-null   int64 
 5   salary_currency     3755 non-null   object
 6   salary_in_usd       3755 non-null   int64 
 7   employee_residence  3755 non-null   object
 8   remote_ratio        3755 non-null   int64 
 9   company_location    3755 non-null   object
 10  company_size        3755 non-null   object
dtypes: int64(4), object(7)
memory usage: 322.8+ KB
None
         work_year        salary  salary_in_usd  remote_ratio
count  3755.000000  3.755000e+03    3755.000000   3755.000000
mean   2022.373635  1.906956e+05  137570.389880     46.271638
std       0.691448  6.716765e+05   63055.625278     48.589050
min    2020.000000  6.000000e+03    5132.000000      0.000000
25%    2022.000000  1.000000e+05   95000.000000      0.000000
50%    2022.000000  1.380000e+05  135000.000000      0.000000
75%    2023.000000  1.800000e+05  175000.000000    100.000000
max    2023.000000  3.040000e+07  450000.000000    100.000000
work_year             0
experience_level      0
employment_type       0
job_title             0
salary                0
salary_currency       0
salary_in_usd         0
employee_residence    0
remote_ratio          0
company_location      0
company_size          0
dtype: int64
Mean Squared Error: 2482079980.9527493
R^2 Score: 0.37127352660208646

Классифицировать уровень опыта

In [17]:

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import seaborn as sns
import matplotlib.pyplot as plt


# Загружаем набор данных
df = pd.read_csv("..//static//csv//ds_salaries.csv")

# Устанавливаем случайное состояние
random_state = 42


# Предобработка данных
# Определяем категориальные и числовые столбцы
categorical_features = ['employment_type', 'job_title', 'employee_residence', 'company_location', 'company_size']
numeric_features = ['work_year', 'salary_in_usd', 'remote_ratio']

# Создаем пайплайн для обработки данных
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)])

# Определяем целевую переменную и признаки
X = df.drop('experience_level', axis=1)
y = df['experience_level']

# Разделяем данные на обучающую и тестовую выборки
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=random_state)

# Создаем и обучаем модель
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=random_state))])

model.fit(X_train, y_train)

# Делаем предсказания на тестовой выборке
y_pred = model.predict(X_test)

# Оцениваем качество модели
print("Classification Report:")
print(classification_report(y_test, y_pred))

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

print(f"Accuracy Score: {accuracy_score(y_test, y_pred)}")

# Визуализация результатов
conf_matrix = confusion_matrix(y_test, y_pred)
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')
plt.show()

Classification Report:
              precision    recall  f1-score   support

          EN       0.55      0.48      0.51        67
          EX       0.46      0.26      0.33        23
          MI       0.48      0.54      0.51       157
          SE       0.83      0.83      0.83       504

    accuracy                           0.72       751
   macro avg       0.58      0.53      0.55       751
weighted avg       0.72      0.72      0.72       751

Confusion Matrix:
[[ 32   0  20  15]
 [  0   6   5  12]
 [ 14   0  84  59]
 [ 12   7  65 420]]
Accuracy Score: 0.7217043941411452

Ориентир

In [21]:

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

df = pd.read_csv("..//static//csv//ds_salaries.csv")

# Предобработка данных
categorical_features = ['experience_level', 'employment_type', 'job_title', 'employee_residence', 'company_location', 'company_size']
numeric_features = ['work_year', 'remote_ratio']

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)])

X = df.drop('salary_in_usd', axis=1)
y = df['salary_in_usd']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())])

model.fit(X_train, y_train)

y_pred = model.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred, squared=False)
r2 = r2_score(y_test, y_pred)

print(f"MAE: {mae}")
print(f"MSE: {mse}")
print(f"RMSE: {rmse}")
print(f"R²: {r2}")

# Проверяем, достигнуты ли ориентиры
if r2 >= 0.75 and mae <= 15000 and rmse <= 20000:
    print("Ориентиры для предсказания заработной платы достигнуты!")
else:
    print("Ориентиры для предсказания заработной платы не достигнуты.")

MAE: 37795.639591701794
MSE: 2482079980.9527493
RMSE: 49820.47752634201
R²: 0.37127352660208646
Ориентиры для предсказания заработной платы не достигнуты.

d:\MII\AIM-PIbd-32-Kaznacheeva-E-K\aimenv\Lib\site-packages\sklearn\metrics\_regression.py:492: FutureWarning: 'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'.
  warnings.warn(

In [23]:

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Загружаем набор данных
df = pd.read_csv("..//static//csv//ds_salaries.csv")

# Предобработка данных
categorical_features = ['employment_type', 'job_title', 'employee_residence', 'company_location', 'company_size']
numeric_features = ['work_year', 'salary_in_usd', 'remote_ratio']

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)])

X = df.drop('experience_level', axis=1)
y = df['experience_level']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42))])

model.fit(X_train, y_train)

y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

print("Classification Report:")
print(classification_report(y_test, y_pred))

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

# Проверяем, достигнуты ли ориентиры
if accuracy >= 0.80:
    print("Ориентиры для классификации уровня опыта достигнуты!")
else:
    print("Ориентиры для классификации уровня опыта не достигнуты.")

Accuracy: 0.7217043941411452
Classification Report:
              precision    recall  f1-score   support

          EN       0.55      0.48      0.51        67
          EX       0.46      0.26      0.33        23
          MI       0.48      0.54      0.51       157
          SE       0.83      0.83      0.83       504

    accuracy                           0.72       751
   macro avg       0.58      0.53      0.55       751
weighted avg       0.72      0.72      0.72       751

Confusion Matrix:
[[ 32   0  20  15]
 [  0   6   5  12]
 [ 14   0  84  59]
 [ 12   7  65 420]]
Ориентиры для классификации уровня опыта не достигнуты.

Конвейер

In [7]:

import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline

# Определение столбцов
numeric_columns = ["work_year", "salary", "salary_in_usd", "remote_ratio"]
cat_columns = ["experience_level", "employment_type", "job_title", "salary_currency", "employee_residence", "company_location", "company_size"]

# Обработка числовых данных: заполнение пропущенных значений медианой и стандартизация
preprocessing_num_class = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Обработка категориальных данных: заполнение пропущенных значений наиболее частым значением и one-hot encoding
preprocessing_cat_class = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Объединение всех преобразований в один ColumnTransformer
features_preprocessing = ColumnTransformer(
    verbose_feature_names_out=False,
    transformers=[
        ("prepocessing_num", preprocessing_num_class, numeric_columns),
        ("prepocessing_cat", preprocessing_cat_class, cat_columns),
    ],
    remainder="passthrough"
)

# Определение конвейера
pipeline_end = Pipeline(
    [
        ("features_preprocessing", features_preprocessing),
    ]
)

In [5]:

# Разделение данных на тренировочный и тестовый наборы
X_train, X_test = train_test_split(df, test_size=0.2, random_state=42)

# Применение конвейера для предобработки данных
preprocessing_result = pipeline_end.fit_transform(X_train)

# Получение имен столбцов после преобразования
feature_names = pipeline_end.named_steps['features_preprocessing'].get_feature_names_out()

# Создание DataFrame с преобразованными данными
preprocessed_df = pd.DataFrame(
    preprocessing_result,
    columns=feature_names,
)

# Вывод преобразованного DataFrame
print(preprocessed_df)

---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[5], line 2
      1 # Разделение данных на тренировочный и тестовый наборы
----> 2 X_train, X_test = train_test_split(df, test_size=0.2, random_state=42)
      4 # Применение конвейера для предобработки данных
      5 preprocessing_result = pipeline_end.fit_transform(X_train)

NameError: name 'train_test_split' is not defined

201 KiB Raw Blame History Unescape Escape

201 KiB

Raw Blame History