IIS_2023_1/alexandrov_dmitrii_lab_3/lab3.py

127 lines
4.0 KiB
Python
Raw Normal View History

from sklearn.impute import SimpleImputer, MissingIndicator
from sklearn.pipeline import FeatureUnion, make_pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
import pandas as pd
import random as rand
import numpy as np
from matplotlib import pyplot as plt
def rank_to_dict(ranks, names, n_features):
ranks = np.abs(ranks)
minmax = MinMaxScaler()
ranks = minmax.fit_transform(np.array(ranks).reshape(len(ranks), 1)).ravel()
ranks = map(lambda x: round(x, 2), ranks)
return dict(zip(names, ranks))
def part_one():
print('Titanic data analysis\n')
data = pd.read_csv('titanic_data.csv', index_col='PassengerId')
x = data[['Pclass', 'Name', 'Sex']]
y = data[['Survived']]
names = pd.DataFrame(TfidfVectorizer().fit_transform(x['Name']).toarray())
col_names = names[names.columns[1:]].apply(lambda el: sum(el.dropna().astype(float)), axis=1)
col_names.index = np.arange(1, len(col_names) + 1)
col_sexes = []
for index, row in x.iterrows():
if row['Sex'] == 'male':
col_sexes.append(1)
else:
col_sexes.append(0)
x = x.drop(columns=['Sex', 'Name'])
x['Sex'] = col_sexes
x['Name'] = col_names
dtc = DecisionTreeClassifier(random_state=rand.randint(0, 250))
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.05, random_state=rand.randint(0, 250))
dtc.fit(x_train, y_train)
print('model score: ' + str(dtc.score(x_test, y_test)))
res = dict(zip(['Pclass', 'Sex', 'Name'], dtc.feature_importances_))
print('feature importances: ' + str(res))
def part_two():
print('\n---------------------------------------------------------------------------\nSberbank data analysis\n')
data = pd.read_csv('sberbank_data.csv', index_col='id')
x = data.drop(columns='price_doc')
y = data[['price_doc']]
x = x.replace(
['NA', 'no', 'yes', 'Investment', 'OwnerOccupier', 'poor', 'satisfactory', 'no data', 'good', 'excellent'],
[0, 0, 1, 0, 1, -1, 0, 0, 1, 2])
x.fillna(0, inplace=True)
names = pd.DataFrame(TfidfVectorizer().fit_transform(x['sub_area']).toarray())
col_area = names[names.columns[1:]].apply(lambda el: sum(el.dropna().astype(float)), axis=1)
col_area.index = np.arange(1, len(col_area) + 1)
col_date = []
for val in x['timestamp']:
col_date.append(val.split('-', 1)[0])
x = x.drop(columns=['sub_area', 'timestamp'])
x['sub_area'] = col_area
x['timestamp'] = col_date
col_price = []
for val in y['price_doc']:
if val < 1500000:
col_price.append('low')
elif val < 3000000:
col_price.append('medium')
elif val < 5500000:
col_price.append('high')
elif val < 10000000:
col_price.append('premium')
else:
col_price.append('oligarch')
y = pd.DataFrame(col_price)
transformer = FeatureUnion(
transformer_list=[
('features', SimpleImputer(strategy='mean')),
('indicators', MissingIndicator())])
dtr = make_pipeline(transformer, DecisionTreeClassifier(random_state=rand.randint(0, 250)))
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.01, random_state=rand.randint(0, 250))
dtr.fit(x_train, y_train)
features = list(x.columns)
print('model score: ' + str(dtr.score(x_test, y_test)))
res = sorted(dict(zip(features, dtr.steps[-1][1].feature_importances_)).items(),
key=lambda el: el[1], reverse=True)
view_y = []
view_x = []
flag = 0
print('feature importances:')
for val in res:
if flag == 8:
break
print(val[0]+" - "+str(val[1]))
view_y.append(val[0])
view_x.append(val[1])
flag = flag + 1
plt.figure(1, figsize=(16, 9))
plt.bar(view_y, view_x)
plt.show()
def start():
part_one()
part_two()
start()