ISEbd-31_Alimova_M.S._MAI/labs/lab1/lab1.ipynb

511 KiB
Raw Permalink Blame History

Лабораторная работа №1

Датасет 12. Цены на акции Starbucks.

  1. Загрузка и сохранение данных
In [22]:
import pandas as pd
In [62]:
# Load the Starbucks share-price history from a CSV file in the working directory.
df = pd.read_csv("coffee.csv")

# Preview the first rows (head() shows five by default).
df.head()
Out[62]:
Date Open High Low Close Adj Close Volume
0 1992-06-26 0.328125 0.347656 0.320313 0.335938 0.260703 224358400
1 1992-06-29 0.339844 0.367188 0.332031 0.359375 0.278891 58732800
2 1992-06-30 0.367188 0.371094 0.343750 0.347656 0.269797 34777600
3 1992-07-01 0.351563 0.359375 0.339844 0.355469 0.275860 18316800
4 1992-07-02 0.359375 0.359375 0.347656 0.355469 0.275860 13996800
In [63]:
# Show the last two rows of the frame.
df.tail(2)
Out[63]:
Date Open High Low Close Adj Close Volume
8034 2024-05-22 77.699997 81.019997 77.440002 80.720001 80.720001 22063400
8035 2024-05-23 80.099998 80.699997 79.169998 79.260002 79.260002 4651418
In [24]:
# Write the frame to a new CSV file; index=False leaves the row index out of the file.
df.to_csv("newCoffee.csv", index=False)
  2. Получение сведений о датафрейме с данными
In [25]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df.describe()
Out[25]:
Open High Low Close Adj Close Volume
count 8036.000000 8036.000000 8036.000000 8036.000000 8036.000000 8.036000e+03
mean 30.054280 30.351487 29.751322 30.058857 26.674025 1.470459e+07
std 33.615577 33.906613 33.314569 33.615911 31.728090 1.340021e+07
min 0.328125 0.347656 0.320313 0.335938 0.260703 1.504000e+06
25% 4.392031 4.531250 4.304922 4.399610 3.414300 7.817750e+06
50% 13.325000 13.493750 13.150000 13.330000 10.352452 1.169815e+07
75% 55.250000 55.722501 54.852499 55.267499 47.464829 1.778795e+07
max 126.080002 126.320000 124.809998 126.059998 118.010414 5.855088e+08
In [26]:
# Print the index range, per-column non-null counts, dtypes and memory usage.
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8036 entries, 0 to 8035
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Date       8036 non-null   object 
 1   Open       8036 non-null   float64
 2   High       8036 non-null   float64
 3   Low        8036 non-null   float64
 4   Close      8036 non-null   float64
 5   Adj Close  8036 non-null   float64
 6   Volume     8036 non-null   int64  
dtypes: float64(5), int64(1), object(1)
memory usage: 439.6+ KB
  3. Получение сведений о колонках датафрейма
In [27]:
# List the column labels of the frame.
df.columns
Out[27]:
Index(['Date', 'Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume'], dtype='object')
  4. Вывод отдельных строк и столбцов из датафрейма
In [28]:
# Select two columns by label; indexing with a list returns a DataFrame.
df[["Open", "Close"]]
Out[28]:
Open Close
0 0.328125 0.335938
1 0.339844 0.359375
2 0.367188 0.347656
3 0.351563 0.355469
4 0.359375 0.355469
... ... ...
8031 75.269997 77.849998
8032 77.680000 77.540001
8033 77.559998 77.720001
8034 77.699997 80.720001
8035 80.099998 79.260002

8036 rows × 2 columns

In [29]:
# Positional slice: rows at positions 5 through 9 (the end position 10 is excluded).
df.iloc[5:10]
Out[29]:
Date Open High Low Close Adj Close Volume
5 1992-07-06 0.351563 0.355469 0.347656 0.355469 0.275860 5753600
6 1992-07-07 0.355469 0.355469 0.347656 0.355469 0.275860 10662400
7 1992-07-08 0.355469 0.355469 0.343750 0.347656 0.269797 15500800
8 1992-07-09 0.351563 0.359375 0.347656 0.359375 0.278891 3923200
9 1992-07-10 0.359375 0.367188 0.351563 0.363281 0.281923 11040000
In [30]:
# Keep only the rows whose opening price is above 120.
df.loc[df['Open'].gt(120)]
Out[30]:
Date Open High Low Close Adj Close Volume
7322 2021-07-23 124.550003 126.320000 123.919998 125.970001 117.926170 7934200
7323 2021-07-26 125.739998 126.099998 124.250000 126.059998 118.010414 4827500
7324 2021-07-27 126.080002 126.160004 124.809998 126.029999 117.982330 6110900
7325 2021-07-28 122.559998 123.330002 121.389999 122.410004 114.593483 11747000
7326 2021-07-29 122.930000 123.470001 122.139999 122.379997 114.565414 6618400
7327 2021-07-30 122.190002 122.980003 121.099998 121.430000 113.676071 5712300
7328 2021-08-02 122.029999 122.980003 120.070000 120.370003 112.683769 5996800
7329 2021-08-03 120.570000 120.750000 117.519997 119.129997 111.522942 6030500
  5. Группировка и агрегация данных в датафрейме
In [31]:
# Mean of "Low" for every distinct value of "High", shown as a one-column frame.
# NOTE(review): "High" is a continuous price, so most rows form their own group
# (5245 groups for 8036 rows) — confirm this is the intended grouping key.
group = df.groupby(by=['High'])['Low'].agg('mean')
group.to_frame()
Out[31]:
Low
High
0.347656 0.320313
0.355469 0.346354
0.359375 0.345052
0.367188 0.341797
0.371094 0.351562
... ...
123.330002 121.389999
123.470001 122.139999
126.099998 124.250000
126.160004 124.809998
126.320000 123.919998

5245 rows × 1 columns

  6. Сортировка данных в датафрейме
In [32]:
# Order the rows by date, oldest first (ascending is the default).
# "Date" is stored as text in YYYY-MM-DD form, so string order matches date order.
sorted_df = df.sort_values('Date')
sorted_df
Out[32]:
Date Open High Low Close Adj Close Volume
0 1992-06-26 0.328125 0.347656 0.320313 0.335938 0.260703 224358400
1 1992-06-29 0.339844 0.367188 0.332031 0.359375 0.278891 58732800
2 1992-06-30 0.367188 0.371094 0.343750 0.347656 0.269797 34777600
3 1992-07-01 0.351563 0.359375 0.339844 0.355469 0.275860 18316800
4 1992-07-02 0.359375 0.359375 0.347656 0.355469 0.275860 13996800
... ... ... ... ... ... ... ...
8031 2024-05-17 75.269997 78.000000 74.919998 77.849998 77.849998 14436500
8032 2024-05-20 77.680000 78.320000 76.709999 77.540001 77.540001 11183800
8033 2024-05-21 77.559998 78.220001 77.500000 77.720001 77.720001 8916600
8034 2024-05-22 77.699997 81.019997 77.440002 80.720001 80.720001 22063400
8035 2024-05-23 80.099998 80.699997 79.169998 79.260002 79.260002 4651418

8036 rows × 7 columns

  7. Удаление строк/столбцов
In [33]:
# Copy of the frame without the 'Adj Close' and 'Volume' columns; df itself is untouched.
df_dropped_columns = df.drop(['Adj Close', 'Volume'], axis='columns')
In [34]:
# Display the frame with the two columns removed.
df_dropped_columns
Out[34]:
Date Open High Low Close
0 1992-06-26 0.328125 0.347656 0.320313 0.335938
1 1992-06-29 0.339844 0.367188 0.332031 0.359375
2 1992-06-30 0.367188 0.371094 0.343750 0.347656
3 1992-07-01 0.351563 0.359375 0.339844 0.355469
4 1992-07-02 0.359375 0.359375 0.347656 0.355469
... ... ... ... ... ...
8031 2024-05-17 75.269997 78.000000 74.919998 77.849998
8032 2024-05-20 77.680000 78.320000 76.709999 77.540001
8033 2024-05-21 77.559998 78.220001 77.500000 77.720001
8034 2024-05-22 77.699997 81.019997 77.440002 80.720001
8035 2024-05-23 80.099998 80.699997 79.169998 79.260002

8036 rows × 5 columns

In [65]:
# Copy of the frame without the rows labelled 3 and 4; df itself is untouched.
df_dropped_rows = df.drop(index=[3, 4])
df_dropped_rows
Out[65]:
Date Open High Low Close Adj Close Volume
0 1992-06-26 0.328125 0.347656 0.320313 0.335938 0.260703 224358400
1 1992-06-29 0.339844 0.367188 0.332031 0.359375 0.278891 58732800
2 1992-06-30 0.367188 0.371094 0.343750 0.347656 0.269797 34777600
5 1992-07-06 0.351563 0.355469 0.347656 0.355469 0.275860 5753600
6 1992-07-07 0.355469 0.355469 0.347656 0.355469 0.275860 10662400
... ... ... ... ... ... ... ...
8031 2024-05-17 75.269997 78.000000 74.919998 77.849998 77.849998 14436500
8032 2024-05-20 77.680000 78.320000 76.709999 77.540001 77.540001 11183800
8033 2024-05-21 77.559998 78.220001 77.500000 77.720001 77.720001 8916600
8034 2024-05-22 77.699997 81.019997 77.440002 80.720001 80.720001 22063400
8035 2024-05-23 80.099998 80.699997 79.169998 79.260002 79.260002 4651418

8034 rows × 7 columns

  8. Создание новых столбцов на основе данных из существующих столбцов датафрейма
In [36]:
# Daily price range: high minus low, stored as a new column on df.
df['Difference'] = df['High'].sub(df['Low'])
In [37]:
# Display the frame, now including the 'Difference' column.
df
Out[37]:
Date Open High Low Close Adj Close Volume Difference
0 1992-06-26 0.328125 0.347656 0.320313 0.335938 0.260703 224358400 0.027343
1 1992-06-29 0.339844 0.367188 0.332031 0.359375 0.278891 58732800 0.035157
2 1992-06-30 0.367188 0.371094 0.343750 0.347656 0.269797 34777600 0.027344
3 1992-07-01 0.351563 0.359375 0.339844 0.355469 0.275860 18316800 0.019531
4 1992-07-02 0.359375 0.359375 0.347656 0.355469 0.275860 13996800 0.011719
... ... ... ... ... ... ... ... ...
8031 2024-05-17 75.269997 78.000000 74.919998 77.849998 77.849998 14436500 3.080002
8032 2024-05-20 77.680000 78.320000 76.709999 77.540001 77.540001 11183800 1.610001
8033 2024-05-21 77.559998 78.220001 77.500000 77.720001 77.720001 8916600 0.720001
8034 2024-05-22 77.699997 81.019997 77.440002 80.720001 80.720001 22063400 3.579995
8035 2024-05-23 80.099998 80.699997 79.169998 79.260002 79.260002 4651418 1.529999

8036 rows × 8 columns

  9. Удаление строк с пустыми значениями
In [38]:
# Count missing values per column. As the cell's last expression the resulting
# Series is displayed directly, so wrapping it in print() is unnecessary.
df.isna().sum()
Date          0
Open          0
High          0
Low           0
Close         0
Adj Close     0
Volume        0
Difference    0
dtype: int64
In [39]:
# Return a copy without rows that contain missing values.
# The result is only displayed, not assigned, so df itself is unchanged.
df.dropna()
Out[39]:
Date Open High Low Close Adj Close Volume Difference
0 1992-06-26 0.328125 0.347656 0.320313 0.335938 0.260703 224358400 0.027343
1 1992-06-29 0.339844 0.367188 0.332031 0.359375 0.278891 58732800 0.035157
2 1992-06-30 0.367188 0.371094 0.343750 0.347656 0.269797 34777600 0.027344
3 1992-07-01 0.351563 0.359375 0.339844 0.355469 0.275860 18316800 0.019531
4 1992-07-02 0.359375 0.359375 0.347656 0.355469 0.275860 13996800 0.011719
... ... ... ... ... ... ... ... ...
8031 2024-05-17 75.269997 78.000000 74.919998 77.849998 77.849998 14436500 3.080002
8032 2024-05-20 77.680000 78.320000 76.709999 77.540001 77.540001 11183800 1.610001
8033 2024-05-21 77.559998 78.220001 77.500000 77.720001 77.720001 8916600 0.720001
8034 2024-05-22 77.699997 81.019997 77.440002 80.720001 80.720001 22063400 3.579995
8035 2024-05-23 80.099998 80.699997 79.169998 79.260002 79.260002 4651418 1.529999

8036 rows × 8 columns

  10. Заполнение пустых значений на основе существующих данных
In [ ]:
# Fill missing numeric values from the data itself.
# numeric_only=True is required: df also holds the text column "Date", and
# pandas 2.x raises TypeError when asked to average a non-numeric column.
# Rebinding df instead of inplace=True keeps the cell safe to re-run.
df = df.fillna(df.mean(numeric_only=True))
# Alternative strategy: the column median. After the mean fill above no numeric
# gaps remain, so this call is a no-op kept only to demonstrate the second option.
df = df.fillna(df.median(numeric_only=True))

Возможности визуализации

In [41]:
import matplotlib.pyplot as plt
In [43]:
# Line chart of the "High" column, drawn on an explicitly created Axes.
fig, ax = plt.subplots(figsize=(10, 5))
df['High'].plot(ax=ax, title='Линейная диаграмма (столбец High)')
plt.show()
No description has been provided for this image
In [45]:
# Histogram of the "Open" column with 80 bins.
# DataFrame.plot opens a new figure when no ax is passed, so the former
# plt.figure(figsize=(8, 5)) call only produced an empty
# "<Figure size 800x500 with 0 Axes>" and the requested size was lost.
# Drawing on an explicit Axes yields a single figure of the intended size.
fig, ax = plt.subplots(figsize=(8, 5))
df[["Open"]].plot.hist(bins=80, ax=ax)
plt.show()
<Figure size 800x500 with 0 Axes>
No description has been provided for this image
In [47]:
# Bar chart: how often each distinct "Open" value occurs.
# NOTE(review): "Open" is a continuous price, so value_counts() presumably yields
# a very large number of bars — confirm this column is the intended one.
fig, ax = plt.subplots(figsize=(40, 10))
open_counts = df['Open'].value_counts()
open_counts.plot(kind='bar', ax=ax, title='Столбчатая диаграмма (Open)')
plt.show()
No description has been provided for this image
In [68]:
# Box-and-whisker plot of the "Volume" column.
fig, ax = plt.subplots(figsize=(8, 20))
df["Volume"].plot.box(ax=ax, title='Ящик с усами')
plt.show()
No description has been provided for this image
In [161]:
# Area chart of the "Open" and "High" columns.
# DataFrame.plot opens a new figure when no ax is passed, so the former
# plt.figure(figsize=(8, 5)) call only produced an empty
# "<Figure size 800x500 with 0 Axes>" and the requested size was lost.
# Passing an explicit Axes yields a single figure of the intended size.
fig, ax = plt.subplots(figsize=(8, 5))
df[['Open', 'High']].plot(kind='area', alpha=0.2, title='Area Plot (Open, High)', ax=ax)
plt.show()
<Figure size 800x500 with 0 Axes>
No description has been provided for this image
In [69]:
# Scatter plot of traded volume against the opening price.
df.plot.scatter(x="Open", y="Volume")
# Render explicitly, like the other plotting cells, so the cell does not end on
# the Axes object and print its "<Axes: ...>" repr as output.
plt.show()
Out[69]:
<Axes: xlabel='Open', ylabel='Volume'>
No description has been provided for this image
In [72]:
# Pie chart.

# Helper column created only to demonstrate the chart: remainder of Volume divided by 500.
df['ForPieChart'] = df['Volume'].mod(500)

# Share of each distinct remainder value, labelled with percentages.
fig, ax = plt.subplots(figsize=(8, 5))
remainder_counts = df['ForPieChart'].value_counts()
remainder_counts.plot(kind='pie', ax=ax, autopct='%1.1f%%', title='Pie Chart (Volume)')
plt.show()
No description has been provided for this image
In [73]:
# Inspect the helper column that feeds the pie chart.
df['ForPieChart']
Out[73]:
0       400
1       300
2       100
3       300
4       300
       ... 
8031      0
8032    300
8033    100
8034    400
8035    418
Name: ForPieChart, Length: 8036, dtype: int64