2023-11-22 22:49:45 +04:00
|
|
|
|
import math
|
|
|
|
|
from functools import reduce
|
|
|
|
|
|
2023-11-23 16:12:33 +04:00
|
|
|
|
from LabWork01.LabWork6.ConvertorDataFrame import CovertorDataFrame
|
2023-11-22 22:49:45 +04:00
|
|
|
|
|
2023-11-23 00:57:08 +04:00
|
|
|
|
# Dataset: run the converter once and take both frames from the same result,
# so the two frames always come from one conversion and the work is not
# repeated.
#   dfMain - frame the decision tree below is built from
#   dfTest - frame whose rows predict_str() evaluates
_frames = CovertorDataFrame()
dfMain = _frames[0]
dfTest = _frames[1]
|
2023-11-22 22:49:45 +04:00
|
|
|
|
|
|
|
|
|
def cstr(s):
    """Summarise the distinct values of a pandas Series as text.

    Args:
        s: pandas.Series whose values are counted.

    Returns:
        A list of ``"value:count"`` strings, ordered by value.
    """
    # f-string formatting (instead of ``k + ":"``) also accepts values that
    # are not strings, e.g. numeric class labels.
    return [f"{k}:{v}" for k, v in sorted(s.value_counts().items())]
|
|
|
|
|
|
|
|
|
|
# Decision-tree data structure. Every node is a dict with these keys:
#   name  - label of the node
#   df    - data associated with the node
#   edges - list of branches leaving the node; stays empty when nothing
#           hangs below it
tree = dict(
    name=" ".join(["decision tree", dfMain.columns[-1], str(cstr(dfMain.iloc[:, -1]))]),
    df=dfMain,
    edges=[],
)

# Nodes that may still receive branches are kept in `open`; it starts with
# the root node only.
open = [tree]
|
|
|
|
|
|
|
|
|
|
def entropy(s):
    """Compute the Shannon entropy (base 2) of the values in a pandas Series.

    Args:
        s: pandas.Series of class labels.

    Returns:
        The entropy as a number; 0 when every value is the same and 0.0 for
        an empty series.
    """
    total = len(s)
    if total == 0:
        # Nothing to measure; avoids dividing by zero / reducing an empty
        # sequence.
        return 0.0
    return -sum((count / total) * math.log2(count / total) for count in s.value_counts())
|
|
|
|
|
|
|
|
|
|
# Grow the tree: keep taking nodes from `open` until it is empty.
while open:
    n = open.pop(0)
    df_n = n["df"]

    # The class column (last column) is already pure: nothing to split.
    if entropy(df_n.iloc[:, -1]) == 0:
        continue

    # For every remaining attribute, partition the rows by attribute value and
    # accumulate the size-weighted entropy of the class column.
    attrs = {}
    for attr in df_n.columns[:-1]:
        attrs[attr] = {"entropy": 0, "dfs": [], "values": []}
        for value in sorted(set(df_n[attr])):
            # A boolean mask is used instead of a string-built query(): it
            # also works for column names that are not valid identifiers and
            # for values that contain quotes or are not strings.
            df_m = df_n[df_n[attr] == value]
            attrs[attr]["entropy"] += entropy(df_m.iloc[:, -1]) * df_m.shape[0] / df_n.shape[0]
            attrs[attr]["dfs"].append(df_m)
            attrs[attr]["values"].append(value)

    # No attribute left to split on: the node stays as it is.
    if not attrs:
        continue

    # Split on the attribute with the lowest weighted entropy.
    attr = min(attrs, key=lambda x: attrs[x]["entropy"])
    for d, v in zip(attrs[attr]["dfs"], attrs[attr]["values"]):
        m = {
            "name": f"{attr}={v}",
            # The tested attribute and value are stored explicitly so that
            # predict_bp() can match a node without parsing its name.
            "attr": attr,
            "value": v,
            "edges": [],
            # The used attribute is removed so it is not split on again.
            "df": d.drop(columns=attr),
        }
        n["edges"].append(m)
        open.append(m)
|
|
|
|
|
|
|
|
|
|
# Print the dataset the tree was built from, followed by a separator line.
print(dfMain, "\n-------------", sep=" ")
|
|
|
|
|
|
|
|
|
|
# Evaluation on the test data
def _node_condition(node):
    """Return the ``(attribute, value)`` pair that *node* tests."""
    if "attr" in node and "value" in node:
        return node["attr"], node["value"]
    # Otherwise fall back to the node name, which the tree builder writes as
    # "<attribute>=<value>".
    attr, _, value = node["name"].partition("=")
    return attr, value


def _node_matches(node, target) -> bool:
    """Tell whether *target* satisfies the condition tested by *node*."""
    attr, value = _node_condition(node)
    # Compared as text because a value recovered from a node name is always a
    # string; a feature missing from *target* simply does not match.
    return attr in target and str(value) == str(target[attr])


def predict_bp(nodes, target) -> int:
    """Predict the target value for one sample by walking the decision tree.

    At each level the first node whose condition matches *target* is followed;
    when none matches, the last node of that level is used instead. The walk
    stops at a node without edges.

    Args:
        nodes: Non-empty list of sibling tree nodes (dicts with "name",
            "edges" and "df"; "attr"/"value" are used when present).
        target: Mapping from feature name to the sample's value.

    Returns:
        The mean of the last column of the reached node's data, truncated to
        an int. The last column is used because the tree builder treats it as
        the class column.

    Raises:
        IndexError: If *nodes* is empty.
    """
    if not nodes:
        raise IndexError("predict_bp() requires at least one candidate node")

    while True:
        chosen = next(
            (node for node in nodes if _node_matches(node, target)),
            nodes[-1],
        )
        if not chosen["edges"]:
            labels = chosen["df"].iloc[:, -1]
            # Cast so numeric labels stored as strings can still be averaged.
            return int(labels.astype(float).mean())
        nodes = chosen["edges"]
|
|
|
|
|
|
|
|
|
|
def predict_str(count: int):
    """Render predictions for the first *count* rows of ``dfTest`` as HTML text.

    Args:
        count: Number of test rows to evaluate. Values larger than the number
            of available rows are clamped instead of raising; values below 1
            produce an empty string.

    Returns:
        One entry per row, joined with ``<br/>``. Each entry shows the row's
        Age/BMI features, the value predicted by predict_bp() and the row's
        actual BloodPressure.
    """
    limit = max(count, 0)
    predictions = []
    for _, row in dfTest.iloc[:limit].iterrows():
        # Built once and reused for both the displayed text and the lookup.
        features = {'Age': row['Age'], 'BMI': row['BMI']}
        predicted = predict_bp(tree['edges'], features)
        predictions.append(
            f"{features}<br/>predict {predicted} / fact {row['BloodPressure']}"
        )
    return '<br/>'.join(predictions)
|
2023-11-22 22:49:45 +04:00
|
|
|
|
|
|
|
|
|
def tstr(tree, indent=""):
    """Render a tree node and everything below it as indented text.

    Args:
        tree: Node dict with "name", "df" and "edges".
        indent: Prefix written before this node's name.

    Returns:
        One line per node, each terminated by a newline. A node without
        edges additionally shows the value counts of its last column.
    """
    children = tree["edges"]
    leaf_summary = "" if children else str(cstr(tree["df"].iloc[:, -1]))
    pieces = [indent + tree["name"] + leaf_summary + "\n"]
    # Every branch of this node is rendered one level deeper.
    child_indent = "\t" + indent + " "
    pieces.extend(tstr(child, child_indent) for child in children)
    return "".join(pieces)
|
|
|
|
|
|
2023-11-23 00:57:08 +04:00
|
|
|
|
def getStringTree():
    """Return the textual rendering of the module-level ``tree``."""
    rendered = tstr(tree, "")
    return rendered
|
2023-11-22 22:49:45 +04:00
|
|
|
|
|
|
|
|
|
# Print the tree in its textual representation.
print(getStringTree())
|