127 lines
4.0 KiB
Python
127 lines
4.0 KiB
Python
from sklearn.impute import SimpleImputer, MissingIndicator
|
|
from sklearn.pipeline import FeatureUnion, make_pipeline
|
|
from sklearn.preprocessing import MinMaxScaler
|
|
from sklearn.tree import DecisionTreeClassifier
|
|
from sklearn.feature_extraction.text import TfidfVectorizer
|
|
from sklearn.model_selection import train_test_split
|
|
import pandas as pd
|
|
import random as rand
|
|
import numpy as np
|
|
from matplotlib import pyplot as plt
|
|
|
|
|
|
def rank_to_dict(ranks, names, n_features):
    """Pair each name with its absolute rank rescaled by ``MinMaxScaler``.

    The absolute values of ``ranks`` are laid out as a single column, passed
    through a default-configured ``MinMaxScaler``, rounded to two decimal
    places and paired positionally with ``names``.

    Args:
        ranks: Numeric scores, one per name (reshaped to one column).
        names: Iterable of keys, matched to ``ranks`` by position.
        n_features: Not used by this function.

    Returns:
        A dict mapping each name to its rounded, rescaled score.
    """
    magnitudes = np.abs(ranks)
    # The scaler works column-wise, so present the scores as an (n, 1) array.
    column = np.array(magnitudes).reshape(len(magnitudes), 1)
    scaled = MinMaxScaler().fit_transform(column).ravel()
    return {name: round(score, 2) for name, score in zip(names, scaled)}
|
|
|
|
|
|
def part_one():
    """Fit a decision tree on the Titanic data and print its results.

    Reads ``titanic_data.csv`` (indexed by ``PassengerId``) and builds three
    features: ``Pclass``, a 0/1 ``Sex`` flag (1 for ``'male'``) and a
    TF-IDF based ``Name`` score. A ``DecisionTreeClassifier`` is trained to
    predict ``Survived`` with 5% of the rows held out, then the hold-out
    score and the per-feature importances are printed.

    The tree and the split each draw a fresh random seed on every call, so
    the printed numbers differ between runs.
    """
    print('Titanic data analysis\n')
    data = pd.read_csv('titanic_data.csv', index_col='PassengerId')

    # Name score: per-row sum of the TF-IDF weights, leaving out the first
    # vocabulary column.
    # NOTE(review): skipping column 0 looks unintentional - confirm.
    tfidf = TfidfVectorizer().fit_transform(data['Name'])
    name_score = np.asarray(tfidf[:, 1:].sum(axis=1)).ravel()

    x = pd.DataFrame({'Pclass': data['Pclass']})
    x['Sex'] = (data['Sex'] == 'male').astype(int)
    # Assigned positionally (plain array): the TF-IDF rows are already in the
    # same order as ``data``. Relabelling them 1..n and aligning on
    # PassengerId would produce NaN or shifted values whenever the ids are
    # not exactly 1..n.
    x['Name'] = name_score
    y = data['Survived']

    # Seed order matters for reproducing a run: classifier first, then split.
    dtc = DecisionTreeClassifier(random_state=rand.randint(0, 250))
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=0.05, random_state=rand.randint(0, 250))
    dtc.fit(x_train, y_train)
    print('model score: ' + str(dtc.score(x_test, y_test)))

    # Label importances from the real column order rather than a hard-coded
    # list, so the two cannot drift apart.
    res = dict(zip(x.columns, dtc.feature_importances_))
    print('feature importances: ' + str(res))
|
|
|
|
|
|
def _price_category(price):
    """Return the price-band label for a single ``price_doc`` value.

    Each threshold is an exclusive upper bound. Any value that is not below
    10,000,000 is labelled ``'oligarch'``; that includes NaN, for which
    every ``<`` comparison is false.
    """
    if price < 1500000:
        return 'low'
    if price < 3000000:
        return 'medium'
    if price < 5500000:
        return 'high'
    if price < 10000000:
        return 'premium'
    return 'oligarch'


def part_two():
    """Classify Sberbank prices into bands and show the top features.

    Reads ``sberbank_data.csv`` (indexed by ``id``). The target is
    ``price_doc`` binned into five labels by ``_price_category``. Features
    are all other columns, prepared as follows:

    * a fixed set of categorical strings is replaced by numeric codes and
      remaining missing values are filled with 0;
    * ``sub_area`` is replaced by a TF-IDF based score;
    * ``timestamp`` is reduced to the text before its first ``'-'``
      (presumably the year - confirm against the data).

    A pipeline (mean imputer + missing indicator, then a
    ``DecisionTreeClassifier``) is fitted with 1% of the rows held out. The
    hold-out score and the eight most important features are printed, and
    those eight are drawn as a bar chart (``plt.show()`` blocks until the
    window is closed on interactive backends).

    The tree and the split each draw a fresh random seed on every call, so
    results differ between runs.
    """
    print('\n---------------------------------------------------------------------------\nSberbank data analysis\n')
    data = pd.read_csv('sberbank_data.csv', index_col='id')
    x = data.drop(columns='price_doc')

    x = x.replace(
        ['NA', 'no', 'yes', 'Investment', 'OwnerOccupier', 'poor', 'satisfactory', 'no data', 'good', 'excellent'],
        [0, 0, 1, 0, 1, -1, 0, 0, 1, 2])
    x = x.fillna(0)

    # Area score: per-row sum of the TF-IDF weights, leaving out the first
    # vocabulary column.
    # NOTE(review): skipping column 0 looks unintentional - confirm.
    tfidf = TfidfVectorizer().fit_transform(x['sub_area'])
    area_score = np.asarray(tfidf[:, 1:].sum(axis=1)).ravel()
    years = [stamp.split('-', 1)[0] for stamp in x['timestamp']]

    # Drop and re-append so both derived columns end up last, in this order.
    x = x.drop(columns=['sub_area', 'timestamp'])
    # Assigned positionally (plain array): the TF-IDF rows are already in the
    # same order as ``x``. Relabelling them 1..n and aligning on ``id`` would
    # produce NaN or shifted values whenever the ids are not exactly 1..n.
    x['sub_area'] = area_score
    x['timestamp'] = years

    y = data['price_doc'].map(_price_category)

    # x should contain no NaN at this point; the imputer and indicator are
    # kept as a safety net in front of the tree.
    transformer = FeatureUnion(
        transformer_list=[
            ('features', SimpleImputer(strategy='mean')),
            ('indicators', MissingIndicator())])

    # Seed order matters for reproducing a run: classifier first, then split.
    dtr = make_pipeline(transformer, DecisionTreeClassifier(random_state=rand.randint(0, 250)))
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=0.01, random_state=rand.randint(0, 250))
    dtr.fit(x_train, y_train)

    features = list(x.columns)
    print('model score: ' + str(dtr.score(x_test, y_test)))

    # zip() stops at the shorter input, so any extra indicator columns that
    # the union appends after the original features are ignored here.
    importances = dtr.steps[-1][1].feature_importances_
    res = sorted(zip(features, importances), key=lambda el: el[1], reverse=True)

    top = res[:8]
    print('feature importances:')
    for name, score in top:
        print(name + " - " + str(score))

    labels = [name for name, _ in top]
    heights = [score for _, score in top]

    plt.figure(1, figsize=(16, 9))
    plt.bar(labels, heights)
    plt.show()
|
|
|
|
|
|
def start():
    """Run both analyses in order: Titanic first, then Sberbank."""
    part_one()
    part_two()


# Only run the analyses when executed as a script. Importing this module
# should not read files, train models or open a plot window.
if __name__ == '__main__':
    start()
|