196 KiB
196 KiB
Работа с NumPy
In [1]:
import numpy as np
matrix = np.array([[4, 5, 0], [9, 9, 9]])
print("matrix = \n", matrix, "\n")
tmatrix = matrix.T
print("tmatrix = \n", tmatrix, "\n")
vector = np.ravel(matrix)
print("vector = \n", vector, "\n")
tvector = np.reshape(vector, (6, 1))
print("tvector = \n", tvector, "\n")
list_matrix = list(matrix)
print("list_matrix = \n", list_matrix, "\n")
str_matrix = str(matrix)
print("matrix as str = \n", str_matrix, "\n")
print("matrix type is", type(matrix), "\n")
print("vector type is", type(vector), "\n")
print("list_matrix type is", type(list_matrix), "\n")
print("str_matrix type is", type(str_matrix), "\n")
formatted_vector = "; ".join(map(str, vector))
print("formatted_vector = \n", formatted_vector, "\n")
Работа с Pandas DataFrame
Работа с данными - чтение и запись CSV
In [3]:
import pandas as pd
df = pd.read_csv("data/healthcare-dataset-stroke-data.csv", index_col="id")
df.to_csv("test.csv")
Работа с данными - основные команды
In [20]:
df.info()
print(df.describe().transpose())
clear_df = df.drop(["heart_disease", "ever_married", "Residence_type"], axis=1)
print(clear_df.head())
print(clear_df.tail())
sorted_df = clear_df.sort_values(by="age")
print(sorted_df.head())
print(sorted_df.tail())
Работа с данными - работа с элементами
In [19]:
print(df["bmi"])
print(df.loc[9046])
print(df.loc[9046, "gender"])
print(df.loc[9046:53882, ["gender", "bmi"]])
print(df[0:3])
print(df.iloc[0])
print(df.iloc[3:5, 0:2])
print(df.iloc[[3, 4], [0, 1]])
Работа с данными - отбор и группировка
In [26]:
s_values = df["gender"].unique()
print(s_values)
s_total = 0
for s_value in s_values:
count = df[df["gender"] == s_value].shape[0]
s_total += count
print(s_value, "count =", count)
print("Total count = ", s_total)
print(df.groupby(["ever_married", "avg_glucose_level"]).size().reset_index(name="Count")) # type: ignore
Визуализация - Исходные данные
In [27]:
data = df[["gender", "age", "bmi"]].copy()
data.dropna(subset=["bmi"], inplace=True)
print(data)
In [28]:
def q1(x):
return x.quantile(0.25)
# median = quantile(0.5)
def q2(x):
return x.quantile(0.5)
def q3(x):
return x.quantile(0.75)
def iqr(x):
return q3(x) - q1(x)
def low_iqr(x):
return max(0, q1(x) - 1.5 * iqr(x))
def high_iqr(x):
return q3(x) + 1.5 * iqr(x)
quantiles = data[["gender", "age"]].groupby(["gender"]).aggregate(["min", q1, q2, "median", q3, "max"])
print(quantiles)
iqrs = data[["gender", "age"]].groupby(["gender"]).aggregate([low_iqr, iqr, high_iqr])
print(iqrs)
data.boxplot(column="age", by="gender")
Out[28]:
Визуализация - Гистограмма
In [29]:
data.plot.hist(column=["age"], bins=80)
Out[29]:
Визуализация - Точечная диаграмма
In [41]:
df.plot.scatter(x="age", y="gender")
df.plot.scatter(x="smoking_status", y="age")
Out[41]:
Визуализация - Столбчатая диаграмма
In [34]:
import matplotlib.pyplot as plt
plot = (
df.groupby(
["Residence_type", "ever_married"]
)
.size()
.unstack() # Преобразование таблицы для корректной визуализации
.plot.bar(
color=["pink", "green"], figsize=(10, 6)
)
)
plot.legend(["Never married", "Ever married"], title="Marital Status")
plot.set_title("Распределение по типу проживания и статусу брака", fontsize=16)
plot.set_xlabel("Тип проживания", fontsize=12)
plot.set_ylabel("Количество", fontsize=12)
# Показать диаграмму
plt.show()
Визуализация - Временные ряды
In [36]:
from datetime import datetime
import matplotlib.dates as md
ts = pd.read_csv("data/dollar.csv")
ts["date"] = ts.apply(lambda row: datetime.strptime(row["my_date"], "%d.%m.%Y"), axis=1)
ts.info()
print(ts)
plot = ts.plot.line(x="date", y="my_value")
plot.xaxis.set_major_locator(md.DayLocator(interval=10))
plot.xaxis.set_major_formatter(md.DateFormatter("%d.%m.%Y"))
plot.tick_params(axis="x", labelrotation=90)