MII_Salin_Oleg_PIbd-33/lec5.ipynb
gg12 darfren f49b209552 done
2024-11-20 19:19:56 +04:00

2.9 MiB
Raw Permalink Blame History

Загрузка данных

In [129]:
from sklearn import datasets
import pandas as pd
import numpy as np


X, y = datasets.make_classification(
    n_samples=144, 
    n_features=5,
    n_informative=3, 
    n_classes=4,
)
print(X)
df = pd.DataFrame(X)  # type: ignore
df.columns = ["X1", "X2", "X3", "X4", "X5"]
df['y'] = y

y_names = ['class' + str(i) for i in range(5)]
display(df.head())
display(df.tail())

display(y)
[[-2.32401186e+00  7.05074756e-01  4.48535225e-01  7.70090520e-01
  -1.16617999e+00]
 [-1.63988490e+00 -2.21973399e+00 -1.04704769e+00 -1.58925732e+00
  -1.51443302e+00]
 [-1.13037684e+00 -2.22988519e+00 -1.94326817e+00 -1.42554953e-01
   2.10224064e+00]
 [ 8.89767155e-01 -1.99946166e-02 -5.92572962e-01  8.44169856e-01
   2.59653162e+00]
 [ 2.52863614e+00 -5.86695065e-01  3.37245492e-01 -1.96432890e+00
  -1.49138457e+00]
 [-9.47696580e-01  7.07797661e-01 -9.33993509e-02  1.48477196e+00
   1.49224073e+00]
 [-2.44791559e+00 -1.22007013e+00 -6.33679067e-01 -5.20124031e-01
  -1.26493805e+00]
 [-2.60738014e-01 -6.97849748e-01 -9.82892815e-02 -9.39816888e-01
  -1.25008567e+00]
 [ 2.75252550e+00 -9.43712510e-03  5.24284409e-01 -1.37074995e+00
  -8.57686558e-01]
 [-3.67259514e-01  7.63618140e-01  7.07417207e-01  9.98208513e-02
  -1.19947859e+00]
 [ 3.36701785e+00  8.88176188e-01  1.10099581e+00 -9.84728456e-01
  -8.51415934e-01]
 [ 6.64819182e-01  3.03843068e+00  1.80593569e+00  1.79060735e+00
  -1.93589802e-02]
 [ 4.61145798e-01  1.01080810e+00  7.82473809e-01  2.42854908e-01
  -5.98775837e-01]
 [ 1.94269425e+00 -3.01272578e+00 -1.28541457e+00 -3.07312169e+00
  -8.09046024e-01]
 [ 7.53183627e-01 -1.38299848e+00 -4.94669889e-01 -1.55270510e+00
  -7.94455067e-01]
 [ 1.66835300e+00 -2.24532878e+00 -5.78017577e-01 -2.98222798e+00
  -1.96090718e+00]
 [-3.23311391e-01 -1.92013284e+00 -8.07199059e-01 -1.72397519e+00
  -1.22268471e+00]
 [ 3.87002898e-01 -1.72388946e+00 -1.45822798e+00 -3.92560220e-01
   1.99062094e+00]
 [ 1.29180739e+00 -6.66698890e-01 -4.89292656e-01 -4.67528892e-01
   9.68194767e-01]
 [ 3.00903015e+00 -5.62690331e-01  4.05331998e-01 -2.12079964e+00
  -1.49594730e+00]
 [-1.05578756e+00 -1.52753087e+00 -7.01896645e-01 -1.13772773e+00
  -1.08257843e+00]
 [ 3.85933835e-02  2.72219392e+00  1.73929616e+00  1.48597851e+00
  -7.16527769e-01]
 [ 1.65725033e+00  2.68970754e-01 -5.13933472e-02  2.61643933e-01
   1.48183567e+00]
 [-1.11207161e+00 -2.30290435e+00 -7.26655967e-01 -2.36588668e+00
  -2.69526342e+00]
 [ 5.61837545e-01  2.66700802e-01  7.30978713e-01 -9.13221264e-01
  -1.97501003e+00]
 [-1.35492363e-01  7.02465614e-01  4.36856228e-01  4.27915114e-01
  -2.00752796e-01]
 [ 2.47328934e+00 -3.47444499e+00 -1.45007006e+00 -3.63801255e+00
  -9.57984984e-01]
 [-1.34159831e+00 -6.26203386e-01 -9.21340952e-01  7.75816671e-01
   1.59145461e+00]
 [-7.99516309e-01 -1.91004018e-01 -1.34443068e-01  4.74966951e-02
  -2.39666613e-01]
 [-1.67702569e+00  5.89636027e-01  1.12756226e-01  1.05319626e+00
   1.39555930e-01]
 [-4.71637541e-01  1.06921441e+00  1.27433073e-01  1.62243064e+00
   1.63500568e+00]
 [-1.59204136e+00 -1.62432795e+00 -1.79256496e+00  6.72608794e-01
   2.64593841e+00]
 [-9.41729071e-02 -1.28770715e-01 -1.02190057e+00  1.56654050e+00
   3.58359742e+00]
 [-1.44225275e+00  8.90625263e-01  2.03200368e-01  1.35560587e+00
   5.46984283e-01]
 [ 3.62538500e-01 -1.78842111e+00 -1.46613316e+00 -4.81452484e-01
   1.87035172e+00]
 [ 1.54884285e+00 -9.65672411e-02 -9.79541287e-02 -2.43744164e-01
   8.20152259e-01]
 [-2.49857121e+00 -2.26679455e+00 -1.13773822e+00 -1.36995184e+00
  -1.63373925e+00]
 [ 3.83263661e-01  5.57447514e-01 -1.94520234e-01  1.19334466e+00
   2.11548115e+00]
 [-1.02259420e-01  1.68088463e+00  7.11452032e-01  1.56386210e+00
   8.88784737e-01]
 [-2.40919500e+00  2.58034721e-04 -5.83804388e-01  1.40215680e+00
   1.21043305e+00]
 [-9.16088134e-01 -1.25825735e+00 -1.13167611e+00  2.57433578e-02
   1.20268179e+00]
 [-6.72912927e-01 -5.13438246e-01 -1.74146645e-01 -4.37010449e-01
  -7.34468017e-01]
 [-1.24940354e+00 -8.33029422e-01 -1.05365534e+00  6.47641045e-01
   1.68680226e+00]
 [ 1.08781107e+00 -6.59236358e-01  3.54211521e-01 -1.87766488e+00
  -2.32363703e+00]
 [ 1.24169036e+00 -7.60775616e-01 -3.73392153e-01 -8.14681743e-01
   3.00057482e-01]
 [-2.98242427e+00 -7.31481416e-02  8.39818910e-03  3.52643415e-01
  -1.45346481e+00]
 [-1.49955191e+00 -4.52175218e-01 -1.01024935e+00  1.24241729e+00
   2.24199765e+00]
 [-1.64372400e+00  1.21437485e+00  8.25868474e-01  8.48376946e-01
  -1.21362382e+00]
 [ 4.82815115e-01 -1.60125809e+00 -6.97300730e-01 -1.51908316e+00
  -6.08851929e-01]
 [ 3.69196732e-01  1.39287818e+00  1.26603432e+00  5.40827949e-02
  -1.65509196e+00]
 [-2.33136489e+00 -6.21017288e-01 -2.15423676e-01 -2.72066365e-01
  -1.51246813e+00]
 [-1.55290874e+00 -6.55141194e-01 -5.45462670e-01  1.13871401e-01
   3.54445022e-03]
 [-1.70867065e+00 -2.09139706e+00 -2.30818816e+00  8.10511888e-01
   3.55187277e+00]
 [ 1.58229686e+00 -7.12483411e-01  3.91380708e-02 -1.50267249e+00
  -1.02651792e+00]
 [ 5.73921945e-01 -9.73172493e-01 -1.97728693e-01 -1.35927833e+00
  -1.11470025e+00]
 [ 1.65983137e+00  5.47758784e-01  1.98195614e-01  2.90712698e-01
   1.13493019e+00]
 [ 9.21364254e-01 -1.06342762e+00 -1.04442434e+00 -1.03786828e-01
   2.07029521e+00]
 [ 3.27190962e+00  1.84630675e-01  9.57879270e-01 -1.88367583e+00
  -1.87253732e+00]
 [ 1.17618176e+00 -1.23443978e+00 -3.15780257e-01 -1.68539285e+00
  -9.76095704e-01]
 [ 5.48339224e-01  2.63682216e+00  1.70610234e+00  1.31893406e+00
  -5.59304052e-01]
 [ 1.08449640e+00 -1.11130225e+00 -9.45159546e-01 -3.80836414e-01
   1.65628285e+00]
 [-1.65570361e-01 -2.65300684e+00 -2.19185538e+00 -5.69896795e-01
   2.54148813e+00]
 [-3.05246009e+00 -6.96843213e-01 -5.34818987e-01  2.71968474e-01
  -7.62466037e-01]
 [-1.40237494e+00  1.12322945e+00  7.74338118e-01  7.47355739e-01
  -1.11252365e+00]
 [-8.93686047e-01 -5.47012333e-01 -4.09857502e-01 -4.95024138e-02
  -5.50505067e-04]
 [ 3.30465394e+00 -8.32078650e-01 -5.48925999e-01 -9.66925184e-01
   1.68901590e+00]
 [ 1.37030563e+00 -5.56716982e-01 -3.49002855e-01 -5.40926713e-01
   7.04387324e-01]
 [ 8.66418291e-01  9.31841174e-01  3.04543386e-01  8.71059388e-01
   1.22674989e+00]
 [ 2.07959188e+00 -3.11611817e-02  4.99785502e-01 -1.25424504e+00
  -1.09615990e+00]
 [ 4.38734763e-01  1.01911030e+00  2.50845305e-01  1.17778079e+00
   1.44026007e+00]
 [-7.84260783e-01  4.64261795e-01 -6.74854194e-01  2.05952052e+00
   3.25349034e+00]
 [ 1.13823897e+00 -2.23127395e+00 -6.13983375e-01 -2.81022722e+00
  -2.01744288e+00]
 [ 1.09352824e+00 -1.81798150e-01  3.84529148e-01 -1.14270174e+00
  -1.40043942e+00]
 [ 7.93459861e-01 -1.31465500e+00 -8.04000422e-01 -9.12580041e-01
   5.52431795e-01]
 [ 5.73059507e-01 -4.79265988e+00 -2.46694576e+00 -3.74839519e+00
  -7.40571087e-01]
 [-2.66876373e-01  2.81947710e+00  1.59694025e+00  1.94225646e+00
  -9.09190168e-02]
 [-1.42090057e+00  1.30261908e-01 -3.93148823e-01  1.12605075e+00
   1.18288094e+00]
 [ 5.80935274e-01 -1.66944383e+00 -1.18161788e+00 -8.11808714e-01
   1.13444490e+00]
 [-1.91694566e+00  5.59871952e-02  1.50956034e-01  1.45489974e-01
  -1.26652345e+00]
 [-2.45626305e-01 -1.79881951e-01 -3.03596982e-02 -2.04394049e-01
  -3.78571542e-01]
 [-7.61341420e-01  4.56966196e-01  3.75352565e-01  2.31143996e-01
  -7.63799261e-01]
 [ 1.87532049e+00 -4.84389287e-02  2.73092483e-01 -8.58096335e-01
  -3.54318040e-01]
 [-1.02560771e+00  1.59381412e+00  3.04852423e-01  2.27292824e+00
   1.86181950e+00]
 [ 6.55133420e-01 -1.45394216e+00 -1.28615744e+00 -2.87737560e-01
   2.03301200e+00]
 [-1.09764135e+00  9.54341127e-01  5.98205807e-01  7.22670118e-01
  -6.77475144e-01]
 [-2.06545779e+00  1.02300229e+00  2.82286533e-01  1.53960758e+00
   2.68501701e-01]
 [ 5.98430922e-01 -1.25715913e+00 -1.15810243e+00 -1.74597264e-01
   1.94715336e+00]
 [ 2.19317944e+00  6.53583615e-01  4.88711783e-01 -1.23238237e-01
   4.80879262e-01]
 [-1.99112180e+00  1.39225582e+00  4.50467983e-01  1.84679500e+00
   4.59310992e-01]
 [-2.28506035e+00  4.29493275e-01  1.86820620e-01  7.60449188e-01
  -7.48417786e-01]
 [ 1.64928678e-01  3.17378349e+00  1.77659211e+00  2.14649065e+00
   1.74958852e-01]
 [-1.26063140e+00 -1.58256204e+00 -1.56003412e+00  2.86014172e-01
   1.98878889e+00]
 [-6.51202394e-01 -1.17946939e+00 -2.34913275e-01 -1.43526440e+00
  -1.93921423e+00]
 [ 4.07206333e-01 -5.86604679e-01 -2.83093246e-01 -5.46469347e-01
  -1.99542756e-02]
 [ 6.26050241e-01 -5.81468747e-01 -2.88410427e-01 -5.64614305e-01
   1.04149941e-01]
 [-3.02807572e+00 -1.80180468e+00 -8.11852158e-01 -1.07811225e+00
  -2.09304062e+00]
 [-6.22496112e-01  8.20149679e-01  8.27459556e-01  2.77215026e-02
  -1.64328274e+00]
 [-5.62435594e-01  2.86710137e+00  1.42271503e+00  2.36993314e+00
   5.52791599e-01]
 [ 1.59443272e+00 -2.72456995e-02  5.16717405e-02 -3.94999873e-01
   4.18477064e-01]
 [-2.59329155e+00  2.41577012e+00  5.67006074e-01  3.43393527e+00
   1.98144945e+00]
 [-1.19257193e+00  3.31165872e-02 -5.62117555e-01  1.21994436e+00
   1.71398590e+00]
 [-9.12678493e-01  8.76975437e-01  7.90712908e-01  2.32431456e-01
  -1.50240317e+00]
 [-3.80285801e-01  1.01492530e+00 -8.92773445e-02  1.89188465e+00
   2.38351295e+00]
 [ 9.80994864e-01  3.98631172e-01  6.22629648e-01 -5.76979510e-01
  -1.09730105e+00]
 [ 1.12293931e+00 -5.96546154e-02 -5.36985599e-01  6.44608332e-01
   2.39682611e+00]
 [-1.68336645e+00 -1.34947106e-01 -5.77738556e-01  1.04973131e+00
   1.20099103e+00]
 [ 3.44644261e+00 -9.06047771e-02  1.01155175e+00 -2.45926174e+00
  -2.60121428e+00]
 [ 1.55635768e+00 -2.42226510e-01  5.53666297e-02 -7.50069826e-01
  -7.85014197e-02]
 [ 9.34177543e-02  8.43197721e-01  7.77867381e-01  3.42471378e-02
  -1.10075669e+00]
 [ 1.84076164e+00 -1.28653158e+00 -3.05988484e-02 -2.37231664e+00
  -1.89707040e+00]
 [-8.24426464e-01  4.95525305e-01 -2.35201187e-01  1.35889573e+00
   1.62515550e+00]
 [-1.37337974e+00 -4.27759235e-01  7.03784303e-02 -6.03001032e-01
  -1.77892144e+00]
 [-1.80630266e-01  9.06001500e-01  9.80680415e-01 -1.67276018e-01
  -1.85504051e+00]
 [-6.81671337e-01  6.53554194e-01  7.22599881e-01 -5.66575952e-02
  -1.62953117e+00]
 [ 3.34476851e-01  3.13051414e-01 -4.47382987e-01  1.23426435e+00
   2.52999859e+00]
 [-3.03854933e-01  8.00584586e-01  1.13519922e+00 -5.87843504e-01
  -2.72624640e+00]
 [ 1.51574132e+00  1.40061832e+00  9.69059527e-01  3.91852912e-01
  -1.86468780e-02]
 [-2.31468266e-01  1.61622069e+00  1.02342668e+00  9.39796900e-01
  -4.97860979e-01]
 [-4.56684606e-01  1.01440959e+00  1.06868065e+00 -9.50219712e-02
  -2.07264636e+00]
 [-1.46692019e+00 -1.20243214e+00 -1.59523052e+00  1.00808170e+00
   2.86119265e+00]
 [ 6.05369271e-02  1.85555246e+00  5.48226964e-01  2.10731187e+00
   1.96034708e+00]
 [-7.69696067e-01  8.68764996e-01  7.21703517e-01  3.14590958e-01
  -1.19612494e+00]
 [-3.54774167e-01  6.71641610e-01  6.20182736e-01  9.64906137e-02
  -1.06070173e+00]
 [-1.10309302e+00 -5.69665127e-01 -7.77939549e-01  5.82639965e-01
   1.26746774e+00]
 [ 2.24413787e+00 -2.24917026e+00 -5.48991553e-01 -3.13286090e+00
  -1.83635720e+00]
 [-1.85867893e+00  5.28156290e-01  1.10377843e-01  9.85518550e-01
  -6.17404072e-02]
 [-8.35303619e-01  8.94105746e-01  4.00333659e-01  9.21821654e-01
   5.85563526e-02]
 [-6.80563819e-01  2.26687673e+00  1.44779726e+00  1.35502965e+00
  -8.96170990e-01]
 [-7.67910130e-01  9.72109045e-01  1.52462414e-01  1.46738401e+00
   1.20315838e+00]
 [-2.86833951e-01  6.86835495e-01  3.17994439e-01  6.32015985e-01
   1.55223606e-01]
 [-5.64907969e-01 -1.19546231e+00 -6.54276407e-01 -7.51993892e-01
  -3.35692143e-01]
 [ 4.13141684e+00 -7.47917442e-01  6.82256144e-01 -3.08816482e+00
  -2.48062866e+00]
 [ 1.58725355e+00  2.17687628e+00  1.56422235e+00  6.34504829e-01
  -5.76190261e-01]
 [ 1.10869323e+00  2.64110459e+00  8.24962970e-01  2.75511863e+00
   3.05263101e+00]
 [-1.49702457e+00  5.24036105e-01 -9.39573409e-02  1.27222053e+00
   8.62852973e-01]
 [ 2.09568939e+00 -1.66461385e+00 -2.12654949e-01 -2.72399819e+00
  -1.91466502e+00]
 [ 1.87232229e-01  6.88821300e-01 -5.27238705e-01  2.01655338e+00
   3.58850346e+00]
 [ 2.73593477e+00 -4.25241487e-01  3.99117606e-01 -1.83847717e+00
  -1.28940701e+00]
 [-5.85530900e-02  1.64278513e+00  9.33853402e-01  1.10996287e+00
  -2.48693018e-02]
 [ 1.70380541e+00  1.80920644e-01  1.72343023e-01 -2.77491184e-01
   4.55840310e-01]
 [-1.82481260e+00 -1.39620162e+00 -4.86021218e-01 -1.16768939e+00
  -1.94751913e+00]
 [ 5.22411828e-02  1.84085437e+00  5.19920027e-01  2.13325900e+00
   2.03303927e+00]
 [ 6.53251865e-01 -1.47619962e+00 -1.06995257e+00 -6.97329890e-01
   1.15812059e+00]
 [-1.59092913e+00 -2.72335754e+00 -2.66660440e+00  3.66518950e-01
   3.59859238e+00]]
X1 X2 X3 X4 X5 y
0 -2.324012 0.705075 0.448535 0.770091 -1.166180 1
1 -1.639885 -2.219734 -1.047048 -1.589257 -1.514433 0
2 -1.130377 -2.229885 -1.943268 -0.142555 2.102241 1
3 0.889767 -0.019995 -0.592573 0.844170 2.596532 3
4 2.528636 -0.586695 0.337245 -1.964329 -1.491385 2
X1 X2 X3 X4 X5 y
139 1.703805 0.180921 0.172343 -0.277491 0.455840 3
140 -1.824813 -1.396202 -0.486021 -1.167689 -1.947519 0
141 0.052241 1.840854 0.519920 2.133259 2.033039 1
142 0.653252 -1.476200 -1.069953 -0.697330 1.158121 3
143 -1.590929 -2.723358 -2.666604 0.366519 3.598592 1
array([1, 0, 1, 3, 2, 1, 0, 1, 2, 0, 3, 2, 2, 2, 2, 0, 0, 3, 3, 2, 0, 3,
       3, 0, 0, 0, 2, 1, 1, 1, 1, 1, 3, 0, 3, 3, 0, 3, 3, 0, 1, 1, 1, 2,
       3, 0, 1, 0, 2, 2, 0, 0, 1, 2, 2, 3, 0, 2, 2, 2, 3, 3, 1, 0, 1, 3,
       3, 3, 2, 3, 3, 2, 2, 3, 2, 2, 2, 3, 0, 2, 1, 3, 3, 3, 0, 0, 3, 3,
       0, 0, 1, 1, 0, 2, 2, 0, 0, 2, 3, 3, 1, 0, 1, 2, 3, 1, 2, 3, 0, 2,
       1, 0, 0, 1, 3, 0, 3, 2, 0, 1, 1, 0, 0, 1, 2, 1, 0, 2, 1, 0, 1, 2,
       2, 1, 1, 2, 1, 2, 2, 3, 0, 1, 3, 1])

Визуализация данных с учетом понимания из особенностей

In [130]:
from visual import draw_data_2d
import matplotlib.pyplot as plt

plt.figure(figsize=(16, 12))
draw_data_2d(df, 0, 1, y.tolist(), y_names, plt.subplot(3, 2, 1))
draw_data_2d(df, 2, 3, y.tolist(), y_names, plt.subplot(3, 2, 2))
draw_data_2d(df, 0, 2, y.tolist(), y_names, plt.subplot(3, 2, 3))
draw_data_2d(df, 1, 3, y.tolist(), y_names, plt.subplot(3, 2, 4))
draw_data_2d(df, 1, 2, y.tolist(), y_names, plt.subplot(3, 2, 5))
draw_data_2d(df, 0, 3, y.tolist(), y_names, plt.subplot(3, 2, 6))
No description has been provided for this image

Визуализация данных без понимания их особенностей

In [131]:
plt.figure(figsize=(16, 12))
draw_data_2d(df, 0, 1, subplot=plt.subplot(3, 2, 1))
draw_data_2d(df, 2, 3, subplot=plt.subplot(3, 2, 2))
draw_data_2d(df, 0, 2, subplot=plt.subplot(3, 2, 3))
draw_data_2d(df, 1, 3, subplot=plt.subplot(3, 2, 4))
draw_data_2d(df, 1, 2, subplot=plt.subplot(3, 2, 5))
draw_data_2d(df, 0, 3, subplot=plt.subplot(3, 2, 6))
No description has been provided for this image

Иерархическая агломеративная кластеризация

Также формируется дендрограмма

In [132]:
from utils_clusters import get_linkage_matrix, run_agglomerative
from visual import draw_dendrogram
from scipy.cluster import hierarchy

tree = run_agglomerative(df)
linkage_matrix = get_linkage_matrix(tree)
draw_dendrogram(linkage_matrix)
No description has been provided for this image

Получение результатов иерархической кластеризации

Также производится сравнение с реальным разбиением

https://joernhees.de/blog/2015/08/26/scipy-hierarchical-clustering-and-dendrogram-tutorial/

In [133]:
result = hierarchy.fcluster(linkage_matrix, 10, criterion="distance")
display(result)
display(y)
# result = [0 if val == 1 else 1 if val == 3 else 2 for val in result]

plt.figure(figsize=(16, 24))
draw_data_2d(df, 0, 1, result, y_names, plt.subplot(4, 2, 1))
draw_data_2d(df, 0, 1, y.tolist(), y_names, plt.subplot(4, 2, 2))
draw_data_2d(df, 2, 3, result, y_names, plt.subplot(4, 2, 3))
draw_data_2d(df, 2, 3, y.tolist(), y_names, plt.subplot(4, 2, 4))
draw_data_2d(df, 0, 2, result, y_names, plt.subplot(4, 2, 5))
draw_data_2d(df, 0, 2, y.tolist(), y_names, plt.subplot(4, 2, 6))
draw_data_2d(df, 1, 3, result, y_names, plt.subplot(4, 2, 7))
draw_data_2d(df, 1, 3, y.tolist(), y_names, plt.subplot(4, 2, 8))
array([4, 4, 6, 7, 2, 7, 4, 4, 2, 3, 2, 5, 5, 1, 2, 1, 4, 6, 2, 2, 4, 5,
       2, 4, 3, 3, 1, 7, 4, 7, 7, 6, 7, 7, 6, 2, 4, 7, 5, 7, 7, 4, 7, 2,
       2, 4, 7, 3, 2, 5, 4, 4, 6, 2, 2, 2, 7, 2, 2, 5, 6, 6, 4, 3, 4, 2,
       2, 5, 2, 5, 7, 1, 2, 6, 1, 5, 7, 6, 4, 2, 3, 2, 7, 6, 3, 7, 6, 2,
       7, 4, 5, 6, 4, 2, 2, 4, 3, 5, 2, 7, 7, 3, 7, 2, 7, 7, 2, 2, 3, 1,
       7, 4, 3, 3, 7, 3, 5, 5, 3, 6, 7, 3, 3, 7, 1, 7, 3, 5, 7, 3, 4, 2,
       5, 7, 7, 1, 7, 2, 5, 2, 4, 7, 6, 6], dtype=int32)
array([1, 0, 1, 3, 2, 1, 0, 1, 2, 0, 3, 2, 2, 2, 2, 0, 0, 3, 3, 2, 0, 3,
       3, 0, 0, 0, 2, 1, 1, 1, 1, 1, 3, 0, 3, 3, 0, 3, 3, 0, 1, 1, 1, 2,
       3, 0, 1, 0, 2, 2, 0, 0, 1, 2, 2, 3, 0, 2, 2, 2, 3, 3, 1, 0, 1, 3,
       3, 3, 2, 3, 3, 2, 2, 3, 2, 2, 2, 3, 0, 2, 1, 3, 3, 3, 0, 0, 3, 3,
       0, 0, 1, 1, 0, 2, 2, 0, 0, 2, 3, 3, 1, 0, 1, 2, 3, 1, 2, 3, 0, 2,
       1, 0, 0, 1, 3, 0, 3, 2, 0, 1, 1, 0, 0, 1, 2, 1, 0, 2, 1, 0, 1, 2,
       2, 1, 1, 2, 1, 2, 2, 3, 0, 1, 3, 1])
No description has been provided for this image

Неиерархическая четка кластеризация (k-means)

In [134]:
from utils_clusters import print_cluster_result, run_kmeans

random_state = 9

labels, centers = run_kmeans(df, 2, random_state)
print_cluster_result(df, 2, labels)
display(centers)
display(y)
Cluster 1 (57):
1, 4, 7, 8, 10, 13, 14, 15, 16, 17, 18, 19, 20, 23, 24, 26, 34, 35, 36, 43, 44, 48, 53, 54, 57, 58, 60, 61, 65, 66, 68, 71, 72, 73, 74, 77, 81, 83, 86, 87, 92, 93, 94, 95, 98, 103, 106, 107, 109, 124, 130, 131, 135, 137, 139, 140, 142

--------
Cluster 2 (87):
0, 2, 3, 5, 6, 9, 11, 12, 21, 22, 25, 27, 28, 29, 30, 31, 32, 33, 37, 38, 39, 40, 41, 42, 45, 46, 47, 49, 50, 51, 52, 55, 56, 59, 62, 63, 64, 67, 69, 70, 75, 76, 78, 79, 80, 82, 84, 85, 88, 89, 90, 91, 96, 97, 99, 100, 101, 102, 104, 105, 108, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 125, 126, 127, 128, 129, 132, 133, 134, 136, 138, 141, 143

--------
array([[ 1.05781172, -1.10417484, -0.35961761, -1.37530793, -0.57595436,
         1.96491228],
       [-0.69486012,  0.55944142,  0.10909332,  0.84899378,  0.50377892,
         1.17241379]])
array([1, 0, 1, 3, 2, 1, 0, 1, 2, 0, 3, 2, 2, 2, 2, 0, 0, 3, 3, 2, 0, 3,
       3, 0, 0, 0, 2, 1, 1, 1, 1, 1, 3, 0, 3, 3, 0, 3, 3, 0, 1, 1, 1, 2,
       3, 0, 1, 0, 2, 2, 0, 0, 1, 2, 2, 3, 0, 2, 2, 2, 3, 3, 1, 0, 1, 3,
       3, 3, 2, 3, 3, 2, 2, 3, 2, 2, 2, 3, 0, 2, 1, 3, 3, 3, 0, 0, 3, 3,
       0, 0, 1, 1, 0, 2, 2, 0, 0, 2, 3, 3, 1, 0, 1, 2, 3, 1, 2, 3, 0, 2,
       1, 0, 0, 1, 3, 0, 3, 2, 0, 1, 1, 0, 0, 1, 2, 1, 0, 2, 1, 0, 1, 2,
       2, 1, 1, 2, 1, 2, 2, 3, 0, 1, 3, 1])

Визуализация результатов кластеризации

In [135]:
from visual import draw_cluster_results

plt.figure(figsize=(16, 12))
draw_cluster_results(df, 0, 1, labels, centers, plt.subplot(2, 2, 1))
draw_cluster_results(df, 2, 3, labels, centers, plt.subplot(2, 2, 2))
draw_cluster_results(df, 0, 2, labels, centers, plt.subplot(2, 2, 3))
draw_cluster_results(df, 1, 3, labels, centers, plt.subplot(2, 2, 4))
No description has been provided for this image

Разбиение на 4 кластера и сравнение с реальным разбиением

In [136]:
labels, centers = run_kmeans(df, 4, random_state)

plt.figure(figsize=(16, 24))
draw_data_2d(df, 0, 1, labels.tolist(), y_names, plt.subplot(4, 2, 1))
draw_data_2d(df, 0, 1, y.tolist(), y_names, plt.subplot(4, 2, 2))
draw_data_2d(df, 2, 3, labels.tolist(), y_names, plt.subplot(4, 2, 3))
draw_data_2d(df, 2, 3, y.tolist(), y_names, plt.subplot(4, 2, 4))
draw_data_2d(df, 0, 2, labels.tolist(), y_names, plt.subplot(4, 2, 5))
draw_data_2d(df, 0, 2, y.tolist(), y_names, plt.subplot(4, 2, 6))
draw_data_2d(df, 1, 3, labels.tolist(), y_names, plt.subplot(4, 2, 7))
draw_data_2d(df, 1, 3, y.tolist(), y_names, plt.subplot(4, 2, 8))
No description has been provided for this image

Понижение размерности до n=2

In [137]:
from sklearn.decomposition import PCA


reduced_data = PCA(n_components=2).fit_transform(df)
reduced_data
Out[137]:
array([[ 1.32593942e+00, -2.18749592e+00],
       [-1.97826650e+00, -1.10701677e+00],
       [ 2.71639984e-01,  2.31820884e+00],
       [ 1.25744596e+00,  2.81369781e+00],
       [-3.22217535e+00, -1.60031442e-01],
       [ 2.40214727e+00,  2.46806378e-01],
       [-3.69674519e-01, -1.65368049e+00],
       [-1.32384707e+00, -9.90035169e-01],
       [-2.37151218e+00,  7.58074265e-02],
       [ 1.81753342e-01, -2.21238708e+00],
       [-1.99480240e+00,  1.54103339e-01],
       [ 2.35227119e+00, -1.44073541e+00],
       [ 1.99596367e-01, -8.24024797e-01],
       [-4.57633500e+00,  1.59002642e+00],
       [-2.32120217e+00,  4.17385129e-01],
       [-4.41726824e+00, -6.68615754e-01],
       [-2.32198366e+00, -7.31514607e-01],
       [-4.29087530e-01,  3.15934054e+00],
       [-7.79068193e-01,  1.97363655e+00],
       [-3.50689785e+00, -6.38732621e-02],
       [-1.39728127e+00, -1.01048460e+00],
       [ 1.88621795e+00, -1.53988950e+00],
       [ 2.22856117e-01,  1.94858265e+00],
       [-3.25080635e+00, -1.87463074e+00],
       [-1.42709881e+00, -2.33401710e+00],
       [ 7.09268385e-01, -1.33053092e+00],
       [-5.44332864e+00,  1.85279709e+00],
       [ 1.50112132e+00,  9.72993095e-01],
       [ 2.19271277e-01, -5.95430684e-01],
       [ 1.76586839e+00, -9.33884019e-01],
       [ 2.53699446e+00,  2.71779314e-01],
       [ 1.51540328e+00,  2.30545107e+00],
       [ 2.50055734e+00,  3.42649564e+00],
       [ 2.25340896e+00, -1.14859403e+00],
       [-5.60663777e-01,  3.09384448e+00],
       [-5.27198952e-01,  1.60023774e+00],
       [-1.56629594e+00, -1.37784438e+00],
       [ 1.75483269e+00,  1.99194756e+00],
       [ 2.19429274e+00,  2.79893301e-01],
       [ 2.53312842e+00, -3.55236486e-01],
       [ 3.69138767e-01,  1.11500796e+00],
       [-5.17740459e-01, -7.79793582e-01],
       [ 1.32307501e+00,  1.18518882e+00],
       [-2.98696830e+00, -1.12012356e+00],
       [-1.32192214e+00,  1.48328012e+00],
       [ 8.80981444e-01, -2.56560118e+00],
       [ 2.23577569e+00,  1.35602887e+00],
       [ 1.39055086e+00, -2.77422805e+00],
       [-2.21479277e+00,  6.22261556e-01],
       [-1.64262467e-01, -1.89675831e+00],
       [-7.52957875e-02, -2.15523014e+00],
       [ 5.09791723e-01, -7.48741253e-01],
       [ 1.82319793e+00,  3.25677676e+00],
       [-2.40004461e+00,  5.62354370e-02],
       [-2.06653462e+00, -1.06587490e-01],
       [ 2.23134983e-01,  1.51894414e+00],
       [ 8.35958260e-02,  1.70207914e+00],
       [-3.26340013e+00, -7.15754258e-01],
       [-2.58496993e+00,  2.90260937e-01],
       [ 1.66323595e+00, -1.67426284e+00],
       [-5.52632938e-01,  2.71809702e+00],
       [-5.28838999e-01,  3.98232108e+00],
       [ 7.96501888e-01, -1.26050462e+00],
       [ 1.22850749e+00, -2.58652708e+00],
       [ 1.27301782e-01, -2.30138121e-01],
       [-1.67496052e+00,  3.11188314e+00],
       [-9.20672211e-01,  1.72153704e+00],
       [ 1.14322455e+00,  1.19199664e+00],
       [-2.14072759e+00, -2.60008050e-01],
       [ 1.64932337e+00,  1.21249551e+00],
       [ 3.23848407e+00,  2.67277957e+00],
       [-4.23543458e+00,  7.15700353e-03],
       [-1.87840480e+00, -6.50297891e-01],
       [-1.36190448e+00,  1.88552701e+00],
       [-5.29221524e+00,  2.30948117e+00],
       [ 2.68774500e+00, -1.59545623e+00],
       [ 1.88346626e+00,  6.36714570e-01],
       [-1.12740736e+00,  2.49466713e+00],
       [ 4.63048602e-01, -2.23789570e+00],
       [-2.84107113e-01, -1.54896111e-01],
       [ 4.05813974e-01, -1.36229837e+00],
       [-1.54763720e+00,  7.17429763e-01],
       [ 3.40877222e+00,  8.83768163e-01],
       [-3.20095470e-01,  3.10637348e+00],
       [ 1.19926669e+00, -2.07549088e+00],
       [ 2.56474512e+00, -1.58998023e+00],
       [-1.66730583e-01,  2.91564356e+00],
       [-4.94736393e-01,  1.06623807e+00],
       [ 2.99558541e+00, -1.62576887e+00],
       [ 1.41640869e+00, -2.12021519e+00],
       [ 2.99625518e+00, -1.90580317e+00],
       [ 8.64102148e-01,  1.83828088e+00],
       [-1.95994093e+00, -1.78882831e+00],
       [-8.06960737e-01,  5.09330916e-01],
       [-8.49760003e-01,  6.56589855e-01],
       [-1.14152403e+00, -2.12534375e+00],
       [ 7.03879551e-02, -2.65638914e+00],
       [ 3.38848855e+00, -1.18011529e+00],
       [-7.86036044e-01,  1.25343042e+00],
       [ 5.23936833e+00,  1.53688725e-01],
       [ 2.10066904e+00,  7.39620232e-01],
       [ 4.08674973e-01, -2.64432373e+00],
       [ 2.97660341e+00,  9.18001854e-01],
       [-1.05396335e+00, -7.58011870e-01],
       [ 9.27898191e-01,  2.73195679e+00],
       [ 1.88170910e+00,  3.09803354e-01],
       [-4.15836121e+00, -1.10080396e+00],
       [-1.32218416e+00,  9.68951593e-01],
       [ 3.58888407e-02, -2.06965739e+00],
       [-3.72549122e+00, -2.55973339e-01],
       [ 2.22694149e+00,  4.99182288e-01],
       [-6.97951691e-01, -2.24857318e+00],
       [-2.84762293e-01, -2.76820542e+00],
       [-9.70900037e-02, -2.14032524e+00],
       [ 1.86466006e+00,  2.44533611e+00],
       [-9.40249095e-01, -3.43079482e+00],
       [ 2.52489015e-01,  9.54766411e-02],
       [ 1.26547237e+00, -1.24281113e+00],
       [-1.70132052e-01, -3.06620379e+00],
       [ 1.98027096e+00,  2.27183117e+00],
       [ 3.15917813e+00,  2.19249127e-01],
       [ 5.34696684e-01, -2.36280599e+00],
       [ 1.91018239e-01, -2.04817767e+00],
       [ 1.16551796e+00,  7.40410345e-01],
       [-4.81789328e+00,  4.21301137e-01],
       [ 1.67713941e+00, -1.10244667e+00],
       [ 1.51632690e+00, -1.39562399e+00],
       [ 1.85415809e+00, -2.02593818e+00],
       [ 2.31951123e+00, -8.71471407e-02],
       [ 1.05068741e+00, -1.07471134e+00],
       [-9.19418638e-01, -5.91973296e-02],
       [-5.10508391e+00, -4.79067414e-01],
       [ 5.73359235e-01, -1.18790144e+00],
       [ 4.01530110e+00,  8.97874118e-01],
       [ 2.12070426e+00, -2.80541551e-01],
       [-4.24576720e+00, -1.69481555e-04],
       [ 3.20006458e+00,  2.18975297e+00],
       [-3.05754194e+00, -4.03559153e-02],
       [ 1.52629544e+00, -8.41106533e-01],
       [-6.37066079e-01,  1.19336895e+00],
       [-1.42342604e+00, -1.95148797e+00],
       [ 3.20407657e+00,  2.83195472e-01],
       [-9.78711507e-01,  2.42328725e+00],
       [ 1.20394346e+00,  3.67222510e+00]])

Визуализация данных после понижения размерности

In [138]:
plt.figure(figsize=(16, 6))
draw_data_2d(
    pd.DataFrame({"Column1": reduced_data[:, 0], "Column2": reduced_data[:, 1]}),
    0,
    1,
    subplot=plt.subplot(1, 2, 1),
)
draw_data_2d(
    pd.DataFrame({"Column1": reduced_data[:, 0], "Column2": reduced_data[:, 1]}),
    0,
    1,
    y.tolist(),
    y_names,
    plt.subplot(1, 2, 2),
)
No description has been provided for this image

Визуализация результатов неиерархической кластеризации для двух кластеров с учетом понижения размерности

In [139]:
from utils_clusters import fit_kmeans
from visual import draw_clusters


kmeans = fit_kmeans(reduced_data, 2, random_state)
draw_clusters(reduced_data, kmeans)
No description has been provided for this image

Визуализация результатов неиерархической кластеризации для четырех кластеров с учетом понижения размерности

In [140]:
kmeans = fit_kmeans(reduced_data, 4, random_state)
draw_clusters(reduced_data, kmeans)
No description has been provided for this image

Сравнение результатов кластеризации с реальным разбиением с учетом понижения размерности

In [141]:
labels = [2 if val == 1 else 1 if val == 2 else val for val in kmeans.labels_]

plt.figure(figsize=(16, 12))
draw_data_2d(
    pd.DataFrame({"Column1": reduced_data[:, 0], "Column2": reduced_data[:, 1]}),
    0,
    1,
    labels,
    y_names,
    plt.subplot(2, 2, 1),
)
draw_data_2d(
    pd.DataFrame({"Column1": reduced_data[:, 0], "Column2": reduced_data[:, 1]}),
    0,
    1,
    result,
    y_names,
    plt.subplot(2, 2, 2),
)
draw_data_2d(
    pd.DataFrame({"Column1": reduced_data[:, 0], "Column2": reduced_data[:, 1]}),
    0,
    1,
    y.tolist(),
    y_names,
    plt.subplot(2, 2, 3),
)
No description has been provided for this image

Выбор количества кластеров на основе инерции

Инерция -- сумма квадратов расстояний выборок до ближайшего центра кластера, взвешенная по весам выборок, если таковые имеются.

In [142]:
from utils_clusters import get_clusters_inertia
from visual import draw_elbow_diagram


inertias, clusters_range = get_clusters_inertia(df, random_state)
display(clusters_range)
display(inertias)
draw_elbow_diagram(inertias, clusters_range)
range(2, 13)
[1095.0175884953487,
 840.435967632246,
 690.9568840077667,
 569.8045875559183,
 489.9561312217587,
 420.18478011974236,
 399.30137712803855,
 343.2316060605482,
 312.7806842989379,
 284.2906232030758,
 268.29293041255244]
No description has been provided for this image

Выбор количества кластеров на основе коэффициента силуэта

Коэффициент силуэта рассчитывается с использованием среднего расстояния внутри кластера (а) и среднего расстояния до ближайшего кластера (b) для каждого образца. Коэффициент силуэта для образца равен (b - a) / max(a, b). Для пояснения: b — это расстояние между образцом и ближайшим кластером, частью которого образец не является. Обратите внимание, что коэффициент силуэта определяется только в том случае, если количество меток равно 2 <= n_labels <= n_samples - 1.

Эта функция возвращает средний коэффициент силуэта по всем образцам.

Лучшее значение — 1, худшее — -1. Значения около 0 указывают на перекрывающиеся кластеры. Отрицательные значения обычно указывают на то, что образец был отнесен к неправильному кластеру.

In [143]:
from utils_clusters import get_clusters_silhouette_scores
from visual import draw_silhouettes_diagram

silhouette_scores, clusters_range = get_clusters_silhouette_scores(df, random_state)
display(clusters_range)
display(silhouette_scores)
draw_silhouettes_diagram(silhouette_scores, clusters_range)
range(2, 13)
[0.27024619065262834,
 0.2704234654274893,
 0.2619942652399716,
 0.2946089738043157,
 0.29442550478815743,
 0.3239316140412729,
 0.2955578664142471,
 0.3109257191360407,
 0.31515029418107016,
 0.3282942559953307,
 0.31900504693790716]
No description has been provided for this image

Пример анализа силуэтов для разбиения от 2 до 12 кластеров

max_clusters = int(math.sqrt(len(df)))

https://scikit-learn.org/1.5/auto_examples/cluster/plot_kmeans_silhouette_analysis.html

In [144]:
from utils_clusters import get_clusters_silhouettes
from visual import draw_silhouettes


silhouettes = get_clusters_silhouettes(reduced_data, random_state)
draw_silhouettes(reduced_data, silhouettes)
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image