78 lines
3.2 KiB
Python
78 lines
3.2 KiB
Python
import pandas as pd # 0 tabs
|
|
import numpy as np # 0 tabs
|
|
|
|
class DecisionTree:
    """A minimal CART-style decision tree classifier split by information gain.

    Expects ``X`` as a 2-D numpy array of numeric features and ``y`` as a 1-D
    array of non-negative integer class labels (``np.bincount`` is used for
    the majority vote at leaves). The fitted tree is a nested dict: internal
    nodes hold ``'feature'``, ``'threshold'``, ``'left'``, ``'right'``;
    leaves hold ``'class'``.
    """

    def __init__(self, max_depth=None):
        # max_depth=None means "grow until every leaf is pure".
        self.max_depth = max_depth
        self.tree = {}

    def entropy(self, y):
        """Return the Shannon entropy (in bits) of the label array ``y``."""
        if len(y) == 0:
            # Bug fix: an empty partition (e.g. threshold == column minimum)
            # previously divided by zero; it carries no information.
            return 0.0
        classes, counts = np.unique(y, return_counts=True)
        probabilities = counts / len(y)
        return float(sum(-p * np.log2(p) for p in probabilities))

    def information_gain(self, X, y, feature, threshold):
        """Entropy reduction achieved by splitting on ``X[:, feature] < threshold``."""
        left_indices = X[:, feature] < threshold
        right_indices = ~left_indices
        left_weight = np.sum(left_indices) / len(y)
        right_weight = 1 - left_weight
        weighted_child_entropy = (left_weight * self.entropy(y[left_indices])
                                  + right_weight * self.entropy(y[right_indices]))
        return self.entropy(y) - weighted_child_entropy

    def best_split(self, X, y):
        """Return ``(feature, threshold)`` with the highest positive gain.

        Returns ``(None, None)`` when no candidate split improves on zero gain
        (callers must treat that as "make a leaf").
        """
        best_feature = None
        best_threshold = None
        best_gain = 0.0
        for feature in range(X.shape[1]):
            # Candidate thresholds are the distinct observed values of the feature.
            for threshold in np.unique(X[:, feature]):
                gain = self.information_gain(X, y, feature, threshold)
                if gain > best_gain:
                    best_gain = gain
                    best_feature = feature
                    best_threshold = threshold
        return best_feature, best_threshold

    def build_tree(self, X, y, depth=0):
        """Recursively build the tree dict for samples ``(X, y)`` at ``depth``."""
        if len(np.unique(y)) == 1 or (self.max_depth is not None and depth >= self.max_depth):
            # Pure node or depth limit reached: majority-class leaf.
            return {'class': np.argmax(np.bincount(y))}
        best_feature, best_threshold = self.best_split(X, y)
        if best_feature is None:
            # Bug fix: no split yields positive gain (e.g. all features constant).
            # The original indexed X[:, None] here and crashed; emit a leaf instead.
            return {'class': np.argmax(np.bincount(y))}
        left_indices = X[:, best_feature] < best_threshold
        right_indices = ~left_indices
        tree = {'feature': best_feature, 'threshold': best_threshold}
        tree['left'] = self.build_tree(X[left_indices], y[left_indices], depth + 1)
        tree['right'] = self.build_tree(X[right_indices], y[right_indices], depth + 1)
        return tree

    def fit(self, X, y):
        """Fit the tree on features ``X`` and integer labels ``y``."""
        self.tree = self.build_tree(X, y)

    def predict_instance(self, tree, x):
        """Walk ``tree`` for a single sample ``x`` and return its predicted class."""
        if 'class' in tree:
            return tree['class']
        if x[tree['feature']] < tree['threshold']:
            return self.predict_instance(tree['left'], x)
        else:
            return self.predict_instance(tree['right'], x)

    def predict(self, X):
        """Return a list with the predicted class for each row of ``X``."""
        return [self.predict_instance(self.tree, x) for x in X]
|
|
|
|
# Usage example: fit the tree on a tiny labelled dataset and print predictions.
df = pd.DataFrame({
    'feature1': [1, 2, 3, 4, 5],
    'feature2': [0, 0, 1, 1, 0],
    'target': [0, 0, 1, 1, 1],
})

X = df[['feature1', 'feature2']].values
y = df['target'].values

model = DecisionTree(max_depth=3)
model.fit(X, y)
predictions = model.predict(X)
print(predictions)