Compare commits
45 Commits
istyukov_t ... kozlov_ale
| Author | SHA1 | Date |
|---|---|---|
|  | a90d980837 |  |
|  | 1cd312ba98 |  |
|  | ea025d0b4a |  |
|  | 43ec6863e4 |  |
|  | e88d1e7fcd |  |
|  | 915ec905c6 |  |
|  | 4bdc8ea733 |  |
|  | 236483abf1 |  |
|  | 8673873a52 |  |
|  | 5035ca66da |  |
|  | b9503c8388 |  |
|  | 7c2ddd98f6 |  |
|  | be4ae51c11 |  |
|  | 5a83f61bd4 |  |
|  | 4498fb5531 |  |
|  | fdda9659eb |  |
|  | 099679a413 |  |
|  | dfa7f803fd |  |
|  | c527892559 |  |
|  | d915c4d712 |  |
|  | b5fc91cfdb |  |
|  | ac68008d93 |  |
|  | 75e614aa55 |  |
|  | 45dc8c70ea |  |
|  | 86d0b82b5a |  |
|  | 23e62553d2 |  |
|  | 7d5463198a |  |
|  | b04582b80e |  |
|  | d8470fb939 |  |
|  | 8fae6c78f8 |  |
|  | 04c0621a05 |  |
|  | 0ce611b443 |  |
|  | fc5942cdb1 |  |
|  | 68d1b445a2 |  |
|  | eb27f1410a |  |
|  | 4bec95e80f |  |
|  | d0c010c491 |  |
|  | 790641d82f |  |
|  | ccc3352aa2 |  |
|  | fece83fa1a |  |
|  | ba4a6f1402 |  |
|  | 71b16e78b7 |  |
|  | 97493ae413 |  |
|  | b58da2aab4 |  |
|  | 4f479043f1 |  |
143
.idea/workspace.xml
generated
@@ -4,10 +4,9 @@
<option name="autoReloadType" value="SELECTIVE" />
</component>
<component name="ChangeListManager">
<list default="true" id="0ceb130e-88da-4a20-aad6-17f5ab4226ac" name="Changes" comment="">
<change beforePath="$PROJECT_DIR$/.idea/IIS_2023_1.iml" beforeDir="false" afterPath="$PROJECT_DIR$/.idea/IIS_2023_1.iml" afterDir="false" />
<change beforePath="$PROJECT_DIR$/.idea/misc.xml" beforeDir="false" afterPath="$PROJECT_DIR$/.idea/misc.xml" afterDir="false" />
<list default="true" id="0ceb130e-88da-4a20-aad6-17f5ab4226ac" name="Changes" comment="commit 3">
<change beforePath="$PROJECT_DIR$/.idea/workspace.xml" beforeDir="false" afterPath="$PROJECT_DIR$/.idea/workspace.xml" afterDir="false" />
<change beforePath="$PROJECT_DIR$/istyukov_timofey_lab1/lab1.py" beforeDir="false" afterPath="$PROJECT_DIR$/istyukov_timofey_lab1/lab1.py" afterDir="false" />
</list>
<option name="SHOW_DIALOG" value="false" />
<option name="HIGHLIGHT_CONFLICTS" value="true" />
@@ -43,29 +42,50 @@
<option name="hideEmptyMiddlePackages" value="true" />
<option name="showLibraryContents" value="true" />
</component>
<component name="PropertiesComponent">{
"keyToString": {
"RunOnceActivity.OpenProjectViewOnStart": "true",
"RunOnceActivity.ShowReadmeOnStart": "true",
"WebServerToolWindowFactoryState": "false",
"git-widget-placeholder": "senkin__alexander__lab__1",
"last_opened_file_path": "D:/ulstukek/Course4/IIS/labs",
"node.js.detected.package.eslint": "true",
"node.js.detected.package.tslint": "true",
"node.js.selected.package.eslint": "(autodetect)",
"node.js.selected.package.tslint": "(autodetect)",
"nodejs_package_manager_path": "npm",
"settings.editor.selected.configurable": "reference.settings.ide.settings.new.ui",
"vue.rearranger.settings.migration": "true"
<component name="PropertiesComponent"><![CDATA[{
"keyToString": {
"RunOnceActivity.OpenProjectViewOnStart": "true",
"RunOnceActivity.ShowReadmeOnStart": "true",
"WebServerToolWindowFactoryState": "false",
"git-widget-placeholder": "senkin__alexander__lab__1",
"last_opened_file_path": "D:/ulstukek/Course4/IIS/labs",
"node.js.detected.package.eslint": "true",
"node.js.detected.package.tslint": "true",
"node.js.selected.package.eslint": "(autodetect)",
"node.js.selected.package.tslint": "(autodetect)",
"nodejs_package_manager_path": "npm",
"settings.editor.selected.configurable": "preferences.sourceCode",
"vue.rearranger.settings.migration": "true"
}
}</component>
}]]></component>
<component name="RecentsManager">
<key name="CopyFile.RECENT_KEYS">
<recent name="D:\ulstukek\Course4\IIS\IISLabs\IIS_2023_1\zavrazhnova_svetlana_lab_3" />
<recent name="D:\ulstukek\Course4\IIS\IISLabs\IIS_2023_1\zavrazhnova_svetlana_lab_1" />
</key>
</component>
<component name="RunManager">
<component name="RunManager" selected="Python.lab1">
<configuration name="lab1" type="PythonConfigurationType" factoryName="Python" temporary="true" nameIsGenerated="true">
<module name="IIS_2023_1" />
<option name="INTERPRETER_OPTIONS" value="" />
<option name="PARENT_ENVS" value="true" />
<envs>
<env name="PYTHONUNBUFFERED" value="1" />
</envs>
<option name="SDK_HOME" value="E:\Programms\Python\python.exe" />
<option name="WORKING_DIRECTORY" value="$PROJECT_DIR$/istyukov_timofey_lab1" />
<option name="IS_MODULE_SDK" value="false" />
<option name="ADD_CONTENT_ROOTS" value="true" />
<option name="ADD_SOURCE_ROOTS" value="true" />
<option name="SCRIPT_NAME" value="$PROJECT_DIR$/istyukov_timofey_lab1/lab1.py" />
<option name="PARAMETERS" value="" />
<option name="SHOW_COMMAND_LINE" value="false" />
<option name="EMULATE_TERMINAL" value="false" />
<option name="MODULE_MODE" value="false" />
<option name="REDIRECT_INPUT" value="false" />
<option name="INPUT_FILE" value="" />
<method v="2" />
</configuration>
<configuration name="zavrazhnova_svetlana_lab3_2" type="PythonConfigurationType" factoryName="Python" temporary="true" nameIsGenerated="true">
<module name="IIS_2023_1" />
<option name="INTERPRETER_OPTIONS" value="" />
@@ -132,13 +152,19 @@
<option name="INPUT_FILE" value="" />
<method v="2" />
</configuration>
<list>
<item itemvalue="Python.lab1" />
<item itemvalue="Python.zavrazhnova_svetlana_lab3_2" />
<item itemvalue="Python.zavrazhnova_svetlana_lab_2" />
<item itemvalue="Python.zavrazhnova_svetlana_lab_3_1" />
</list>
<recent_temporary>
<list>
<item itemvalue="Python.lab1" />
<item itemvalue="Python.zavrazhnova_svetlana_lab_3_1" />
<item itemvalue="Python.zavrazhnova_svetlana_lab_2" />
<item itemvalue="Python.zavrazhnova_svetlana_lab3_2" />
<item itemvalue="Python.zavrazhnova_svetlana_lab3_2" />
<item itemvalue="Python.zavrazhnova_svetlana_lab_3_1" />
</list>
</recent_temporary>
</component>
@@ -153,6 +179,35 @@
<workItem from="1697735437405" duration="1706000" />
<workItem from="1697740229646" duration="3802000" />
</task>
<task id="LOCAL-00001" summary="commit 1">
<created>1702203771661</created>
<option name="number" value="00001" />
<option name="presentableId" value="LOCAL-00001" />
<option name="project" value="LOCAL" />
<updated>1702203771661</updated>
</task>
<task id="LOCAL-00002" summary="commit 2">
<created>1702208133904</created>
<option name="number" value="00002" />
<option name="presentableId" value="LOCAL-00002" />
<option name="project" value="LOCAL" />
<updated>1702208133904</updated>
</task>
<task id="LOCAL-00003" summary="create README">
<created>1702208193675</created>
<option name="number" value="00003" />
<option name="presentableId" value="LOCAL-00003" />
<option name="project" value="LOCAL" />
<updated>1702208193675</updated>
</task>
<task id="LOCAL-00004" summary="commit 3">
<created>1702208529340</created>
<option name="number" value="00004" />
<option name="presentableId" value="LOCAL-00004" />
<option name="project" value="LOCAL" />
<updated>1702208529340</updated>
</task>
<option name="localTasksCounter" value="5" />
<servers />
</component>
<component name="TypeScriptGeneratedFilesManager">
@@ -169,7 +224,14 @@
<entry key="branch">
<value>
<list>
<option value="HEAD" />
<option value="istyukov_timofey_lab_1" />
</list>
</value>
</entry>
<entry key="user">
<value>
<list>
<option value="*" />
</list>
</value>
</entry>
@@ -180,8 +242,43 @@
</entry>
</map>
</option>
<option name="RECENT_FILTERS">
<map>
<entry key="Branch">
<value>
<list>
<RecentGroup>
<option name="FILTER_VALUES">
<option value="istyukov_timofey_lab_1" />
</option>
</RecentGroup>
<RecentGroup>
<option name="FILTER_VALUES">
<option value="HEAD" />
</option>
</RecentGroup>
</list>
</value>
</entry>
<entry key="User">
<value>
<list>
<RecentGroup>
<option name="FILTER_VALUES">
<option value="*" />
</option>
</RecentGroup>
</list>
</value>
</entry>
</map>
</option>
</component>
<component name="com.intellij.coverage.CoverageDataManagerImpl">
<SUITE FILE_PATH="coverage/PyCharmProjects$senkin_alexander_lab_1.coverage" NAME="senkin_alexander_lab_1 Coverage Results" MODIFIED="1697744262965" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/senkin_alexander_lab_1" />
<component name="VcsManagerConfiguration">
<MESSAGE value="commit 1" />
<MESSAGE value="commit 2" />
<MESSAGE value="create README" />
<MESSAGE value="commit 3" />
<option name="LAST_COMMIT_MESSAGE" value="commit 3" />
</component>
</project>
46
arzamaskina_milana_lab_7/README.md
Normal file
@@ -0,0 +1,46 @@
# Laboratory work No. 7

## Recurrent neural network and the text generation task

#### PIbd-41 Arzamaskina Milana
#### Variant No. 2

### Technologies used:

Libraries used:
* numpy
* keras
* tensorflow

### How to run:

* install python, numpy, keras, tensorflow
* run the project (the entry point is main.py)

### What the program does:

A recurrent neural network is trained on selected literary texts to solve the text generation task.
The architecture and parameters have to be chosen so that the output gets as close as possible to a meaningful result.

* Reads the text from files (english.txt, russian.txt)
* Builds the input and output data (X, y), the vocabulary size, and the tokenizer. Tokenizer is used with the char_level=True setting (see the sketch after this file)
* Creates a Sequential object (a sequential recurrent neural network) and adds two LSTM layers. Dropout is a regularization method for neural networks and deep-learning models that addresses overfitting. A Dense layer with the softmax activation function is used to predict the next character
* Compiles the model
* Trains the model
* Generates text

#### Generated texts:

Generation in Russian:

Generation in English:

### Conclusion:

The program is able to generate meaningful text in each case.
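As a minimal sketch (an editor's illustration, not one of the committed lab files), this is what `Tokenizer(char_level=True)` from the pipeline above produces: a character-to-integer mapping that can be inverted through `index_word`. The sample string is arbitrary.

```python
# Minimal sketch, assuming only keras is installed; not part of the lab's files.
from keras.preprocessing.text import Tokenizer

text = "It sparkled and smoked"
tokenizer = Tokenizer(char_level=True)   # one token per character
tokenizer.fit_on_texts([text])           # builds the char -> int index
encoded = tokenizer.texts_to_sequences([text])[0]
decoded = "".join(tokenizer.index_word[i] for i in encoded)
print(encoded)                           # a list of small integers, one per char
assert decoded == text.lower()           # Tokenizer lowercases by default
```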
8
arzamaskina_milana_lab_7/english.txt
Normal file
@@ -0,0 +1,8 @@
The cloud shuddered with blue flame. Thunder rumbled slowly.
It either intensified or almost died down. And the rain, obeying the thunder, began to fall harder at times and rustle widely through the leaves, then stopped.
Soon the sun broke through the clouds. The old Pushkin Park in Mikhailovskoye and the steep banks of Soroti were ablaze with red clay and wet grass.
A slender rainbow lit up across the cloudy distance. It sparkled and smoked, surrounded by wisps of ashen clouds.
The rainbow looked like an arch erected on the border of a protected land. Here, in Pushkin’s places, thoughts about the Russian language arose with particular force.
Here Pushkin wandered with his head uncovered, with his cold hair tangled by the autumn wind, listening to the wet hum of the pine tops, looking, squinting,
to see where the autumn clouds rush; he jostled about the fairs. Here wonderful words overwhelmed him, oppressed his soul and, finally, were composed, one by one, with the stub of a goose feather, into ringing stanzas.
BIN
arzamaskina_milana_lab_7/img1.png
Normal file
After: Size 106 KiB
BIN
arzamaskina_milana_lab_7/img2.png
Normal file
After: Size 103 KiB
62
arzamaskina_milana_lab_7/main.py
Normal file
@@ -0,0 +1,62 @@
import numpy as np
from keras.layers import LSTM, Dense
from keras.models import Sequential
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer

# Read the text from a file
# with open('russian.txt', 'r', encoding='utf-8') as file:
#     text = file.read()
with open('english.txt', 'r', encoding='utf-8') as file:
    text = file.read()

# Fit the Tokenizer on the text (one token per character)
tokenizer = Tokenizer(char_level=True)
tokenizer.fit_on_texts([text])
sequences = tokenizer.texts_to_sequences([text])[0]

# Build the X, y sequences: each window of seq_length characters predicts the next one
X_data, y_data = [], []
seq_length = 10
for i in range(seq_length, len(sequences)):
    sequence = sequences[i - seq_length:i]
    target = sequences[i]
    X_data.append(sequence)
    y_data.append(target)

# Convert to arrays; the LSTM layers expect 3-D input (samples, timesteps, features)
X_mass = pad_sequences(X_data, maxlen=seq_length)
X_mass = np.expand_dims(X_mass, -1)  # (samples, seq_length) -> (samples, seq_length, 1)
y_mass = np.array(y_data)

# Build the model
vocab_size = len(tokenizer.word_index) + 1
model = Sequential()
model.add(LSTM(256, input_shape=(seq_length, 1), return_sequences=True))
model.add(LSTM(128))
model.add(Dense(vocab_size, activation='softmax'))

# Compile
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train
model.fit(X_mass, y_mass, epochs=100, verbose=1)

# Generation function
def generate_text(_text, gen_length):
    generated_text = _text
    for _ in range(gen_length):
        seq = tokenizer.texts_to_sequences([_text])[0]
        seq = pad_sequences([seq], maxlen=seq_length)
        seq = np.expand_dims(seq, -1)  # same 3-D shape as the training data
        prediction = model.predict(seq)[0]
        predicted_index = np.argmax(prediction)
        predicted_char = tokenizer.index_word[predicted_index]
        generated_text += predicted_char
        _text += predicted_char
        _text = _text[1:]
    return generated_text

# Generate text
# _text = "Она сверкала"
_text = "It sparkled and smoked"
generated = generate_text(_text, 250)
print(generated)
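The committed generate_text decodes greedily with np.argmax, which for char-level models often collapses into repetitive loops. Below is a hedged sketch of one common alternative, temperature sampling. It is not part of the lab's code; it only assumes the probability vector returned by model.predict(seq)[0] in main.py.

```python
# Sketch only: sample the next character instead of always taking the argmax.
import numpy as np

def sample_index(prediction, temperature=0.8):
    # prediction: 1-D probability vector over the vocabulary
    logits = np.log(np.asarray(prediction, dtype=np.float64) + 1e-8) / temperature
    probs = np.exp(logits)
    probs[0] = 0.0            # index 0 is the padding id; it maps to no character
    probs /= probs.sum()
    return int(np.random.choice(len(probs), p=probs))

# Usage inside the generation loop, replacing np.argmax(prediction):
# predicted_index = sample_index(prediction, temperature=0.8)
```

Lower temperatures stay close to greedy decoding; higher ones trade coherence for variety.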
7
arzamaskina_milana_lab_7/russian.txt
Normal file
@@ -0,0 +1,7 @@
Тучу передернуло синим пламенем. Медленно загремел гром.
Он то усиливался, то почти затихал. И дождь, подчиняясь грому, начал временами идти сильнее и широко шуметь по листве, потом останавливался.
Вскоре сквозь тучи пробилось солнце. Старый пушкинский парк в Михайловском и крутые берега Сороти запылали рыжей глиной и мокрой травой.
Стройная радуга зажглась над пасмурной далью. Она сверкала и дымилась, окруженная космами пепельных туч.
Радуга была похожа на арку, воздвигнутую на границе заповедной земли. С особенной силой здесь, в пушкинских местах, возникали мысли о русском языке.
Здесь Пушкин бродил с непокрытой головой, со спутанными осенним ветром холодными волосами, слушал влажный гул сосновых вершин, смотрел, прищурившись,
откуда несутся осенние тучи, толкался по ярмаркам. Здесь чудесные слова переполняли его, стесняли его душу и, наконец, слагались огрызком гусиного пера в звенящие строфы.
136
basharin_sevastyan_lab_7/69209.txt
Normal file
@@ -0,0 +1,136 @@
Annotation
The Fellowship of the Ring is the first part of J.R.R. Tolkien's epic adventure, The Lord of the Rings.
Sauron, the Dark Lord, has gathered to him all the Rings of Power - the means by which he intends to rule Middle-earth. All he lacks in his plans for dominion is the One Ring - the ring that rules them all - which has fallen into the hands of the hobbit Bilbo Baggins.
In a sleepy village in the Shire, young Frodo Baggins finds himself faced with an immense task, as his elderly cousin Bilbo entrusts the Ring to his care. Frodo must leave his home and make a perilous journey across Middle-earth to the Cracks of Doom, there to destroy the Ring and foil the Dark Lord in his evil purpose.
* * *
JRR Tolkien The Lord of the Rings 1 - The Fellowship of the Ring
Table of Contents
Foreword
This tale grew in the telling, until it became a history of the Great War of the Ring and included many glimpses of the yet more ancient history that preceded it. It was begun soon after The Hobbit was written and before its publication in 1937; but I did not go on with this sequel, for I wished first to complete and set in order the mythology and legends of the Elder Days, which had then been taking shape for some years. I desired to do this for my own satisfaction, and I had little hope that other people would be interested in this work, especially since it was primarily linguistic in inspiration and was begun in order to provide the necessary background of 'history' for Elvish tongues.
When those whose advice and opinion I sought corrected little hope to no hope, I went back to the sequel, encouraged by requests from readers for more information concerning hobbits and their adventures. But the story was drawn irresistibly towards the older world, and became an account, as it were, of its end and passing away before its beginning and middle had been told. The process had begun in the writing of The Hobbit, in which there were already some references to the older matter: Elrond, Gondolin, the High-elves, and the orcs, as well as glimpses that had arisen unbidden of things higher or deeper or darker than its surface: Durin, Moria, Gandalf, the Necromancer, the Ring. The discovery of the significance of these glimpses and of their relation to the ancient histories revealed the Third Age and its culmination in the War of the Ring.
Those who had asked for more information about hobbits eventually got it, but they had to wait a long time; for the composition of The Lord of the Rings went on at intervals during the years 1936 to 1949, a period in which I had many duties that I did not neglect, and many other interests as a learner and teacher that often absorbed me. The delay was, of course, also increased by the outbreak of war in 1939, by the end of which year the tale had not yet reached the end of Book One. In spite of the darkness of the next five years I found that the story could not now be wholly abandoned, and I plodded on, mostly by night, till I stood by Balin's tomb in Moria. There I halted for a long while. It was almost a year later when I went on and so came to Lothlorien and the Great River late in 1941. In the next year I wrote the first drafts of the matter that now stands as Book Three, and the beginnings of chapters I and III of Book Five; and there as the beacons flared in Anorien and Theoden came to Harrowdale I stopped. Foresight had failed and there was no time for thought.
It was during 1944 that, leaving the loose ends and perplexities of a war which it was my task to conduct, or at least to report, I forced myself to tackle the journey of Frodo to Mordor. These chapters, eventually to become Book Four, were written and sent out as a serial to my son, Christopher, then in South Africa with the RAF. Nonetheless it took another five years before the tale was brought to its present end; in that time I changed my house, my chair, and my college, and the days though less dark were no less laborious. Then when the 'end' had at last been reached the whole story had to be revised, and indeed largely re-written backwards. And it had to be typed, and re-typed: by me; the cost of professional typing by the ten-fingered was beyond my means.
The Lord of the Rings has been read by many people since it finally appeared in print; and I should like to say something here with reference to the many opinions or guesses that I have received or have read concerning the motives and meaning of the tale. The prime motive was the desire of a tale-teller to try his hand at a really long story that would hold the attention of readers, amuse them, delight them, and at times maybe excite them or deeply move them. As a guide I had only my own feelings for what is appealing or moving, and for many the guide was inevitably often at fault. Some who have read the book, or at any rate have reviewed it, have found it boring, absurd, or contemptible; and I have no cause to complain, since I have similar opinions of their works, or of the kinds of writing that they evidently prefer. But even from the points of view of many who have enjoyed my story there is much that fails to please. It is perhaps not possible in a long tale to please everybody at all points, nor to displease everybody at the same points; for I find from the letters that I have received that the passages or chapters that are to some a blemish are all by others specially approved. The most critical reader of all, myself, now finds many defects, minor and major, but being fortunately under no obligation either to review the book or to write it again, he will pass over these in silence, except one that has been noted by others: the book is too short.
As for any inner meaning or 'message', it has in the intention of the author none. It is neither allegorical nor topical. As the story grew it put down roots (into the past) and threw out unexpected branches: but its main theme was settled from the outset by the inevitable choice of the Ring as the link between it and The Hobbit. The crucial chapter, 'The Shadow of the Past', is one of the oldest parts of the tale. It was written long before the foreshadow of 1939 had yet become a threat of inevitable disaster, and from that point the story would have developed along essentially the same lines, if that disaster had been averted. Its sources are things long before in mind, or in some cases already written, and little or nothing in it was modified by the war that began in 1939 or its sequels.
The real war does not resemble the legendary war in its process or its conclusion. If it had inspired or directed the development of the legend, then certainly the Ring would have been seized and used against Sauron; he would not have been annihilated but enslaved, and Barad-dur would not have been destroyed but occupied. Saruman, failing to get possession of the Ring, would in the confusion and treacheries of the time have found in Mordor the missing links in his own researches into Ring-lore, and before long he would have made a Great Ring of his own with which to challenge the self-styled Ruler of Middle-earth. In that conflict both sides would have held hobbits in hatred and contempt: they would not long have survived even as slaves.
Other arrangements could be devised according to the tastes or views of those who like allegory or topical reference. But I cordially dislike allegory in all its manifestations, and always have done so since I grew old and wary enough to detect its presence. I much prefer history, true or feigned, with its varied applicability to the thought and experience of readers. I think that many confuse 'applicability' with 'allegory'; but the one resides in the freedom of the reader, and the other in the purposed domination of the author.
An author cannot of course remain wholly unaffected by his experience, but the ways in which a story-germ uses the soil of experience are extremely complex, and attempts to define the process are at best guesses from evidence that is inadequate and ambiguous. It is also false, though naturally attractive, when the lives of an author and critic have overlapped, to suppose that the movements of thought or the events of times common to both were necessarily the most powerful influences. One has indeed personally to come under the shadow of war to feel fully its oppression; but as the years go by it seems now often forgotten that to be caught in youth by 1914 was no less hideous an experience than to be involved in 1939 and the following years. By 1918 all but one of my close friends were dead. Or to take a less grievous matter: it has been supposed by some that 'The Scouring of the Shire' reflects the situation in England at the time when I was finishing my tale. It does not. It is an essential part of the plot, foreseen from the outset, though in the event modified by the character of Saruman as developed in the story without, need I say, any allegorical significance or contemporary political reference whatsoever. It has indeed some basis in experience, though slender (for the economic situation was entirely different), and much further back. The country in which I lived in childhood was being shabbily destroyed before I was ten, in days when motor-cars were rare objects (I had never seen one) and men were still building suburban railways. Recently I saw in a paper a picture of the last decrepitude of the once thriving corn-mill beside its pool that long ago seemed to me so important. I never liked the looks of the Young miller, but his father, the Old miller, had a black beard, and he was not named Sandyman.
The Lord of the Rings is now issued in a new edition, and the opportunity has been taken of revising it. A number of errors and inconsistencies that still remained in the text have been corrected, and an attempt has been made to provide information on a few points which attentive readers have raised. I have considered all their comments and enquiries, and if some seem to have been passed over that may be because I have failed to keep my notes in order; but many enquiries could only be answered by additional appendices, or indeed by the production of an accessory volume containing much of the material that I did not include in the original edition, in particular more detailed linguistic information. In the meantime this edition offers this Foreword, an addition to the Prologue, some notes, and an index of the names of persons and places. This index is in intention complete in items but not in references, since for the present purpose it has been necessary to reduce its bulk. A complete index, making full use of the material prepared for me by Mrs. N. Smith, belongs rather to the accessory volume.
Prologue
This book is largely concerned with Hobbits, and from its pages a reader may discover much of their character and a little of their history. Further information will also be found in the selection from the Red Book of Westmarch that has already been published, under the title of The Hobbit. That story was derived from the earlier chapters of the Red Book, composed by Bilbo himself, the first Hobbit to become famous in the world at large, and called by him There and Back Again, since they told of his journey into the East and his return: an adventure which later involved all the Hobbits in the great events of that Age that are here related.
Many, however, may wish to know more about this remarkable people from the outset, while some may not possess the earlier book. For such readers a few notes on the more important points are here collected from Hobbit-lore, and the first adventure is briefly recalled.
Hobbits are an unobtrusive but very ancient people, more numerous formerly than they are today; for they love peace and quiet and good tilled earth: a well-ordered and well-farmed countryside was their favourite haunt. They do not and did not understand or like machines more complicated than a forge-bellows, a water-mill, or a hand-loom, though they were skilful with tools. Even in ancient days they were, as a rule, shy of 'the Big Folk', as they call us, and now they avoid us with dismay and are becoming hard to find. They are quick of hearing and sharp-eyed, and though they are inclined to be fat and do not hurry unnecessarily, they are nonetheless nimble and deft in their movements. They possessed from the first the art of disappearing swiftly and silently, when large folk whom they do not wish to meet come blundering by; and this art they have developed until to Men it may seem magical. But Hobbits have never, in fact, studied magic of any kind, and their elusiveness is due solely to a professional skill that heredity and practice, and a close friendship with the earth, have rendered inimitable by bigger and clumsier races.
For they are a little people, smaller than Dwarves: less stout and stocky, that is, even when they are not actually much shorter. Their height is variable, ranging between two and four feet of our measure. They seldom now reach three feet; but they have dwindled, they say, and in ancient days they were taller. According to the Red Book, Bandobras Took (Bullroarer), son of Isengrim the Second, was four foot five and able to ride a horse. He was surpassed in all Hobbit records only by two famous characters of old; but that curious matter is dealt with in this book.
As for the Hobbits of the Shire, with whom these tales are concerned, in the days of their peace and prosperity they were a merry folk. They dressed in bright colours, being notably fond of yellow and green; but they seldom wore shoes, since their feet had tough leathery soles and were clad in a thick curling hair, much like the hair of their heads, which was commonly brown. Thus, the only craft little practised among them was shoe-making; but they had long and skilful fingers and could make many other useful and comely things. Their faces were as a rule good-natured rather than beautiful, broad, bright-eyed, red-cheeked, with mouths apt to laughter, and to eating and drinking. And laugh they did, and eat, and drink, often and heartily, being fond of simple jests at all times, and of six meals a day (when they could get them). They were hospitable and delighted in parties, and in presents, which they gave away freely and eagerly accepted.
It is plain indeed that in spite of later estrangement Hobbits are relatives of ours: far nearer to us than Elves, or even than Dwarves. Of old they spoke the languages of Men, after their own fashion, and liked and disliked much the same things as Men did. But what exactly our relationship is can no longer be discovered. The beginning of Hobbits lies far back in the Elder Days that are now lost and forgotten. Only the Elves still preserve any records of that vanished time, and their traditions are concerned almost entirely with their own history, in which Men appear seldom and Hobbits are not mentioned at all. Yet it is clear that Hobbits had, in fact, lived quietly in Middle-earth for many long years before other folk became even aware of them. And the world being after all full of strange creatures beyond count, these little people seemed of very little importance. But in the days of Bilbo, and of Frodo his heir, they suddenly became, by no wish of their own, both important and renowned, and troubled the counsels of the Wise and the Great.
Those days, the Third Age of Middle-earth, are now long past, and the shape of all lands has been changed; but the regions in which Hobbits then lived were doubtless the same as those in which they still linger: the North-West of the Old World, east of the Sea. Of their original home the Hobbits in Bilbo's time preserved no knowledge. A love of learning (other than genealogical lore) was far from general among them, but there remained still a few in the older families who studied their own books, and even gathered reports of old times and distant lands from Elves, Dwarves, and Men. Their own records began only after the settlement of the Shire, and their most ancient legends hardly looked further back than their Wandering Days. It is clear, nonetheless, from these legends, and from the evidence of their peculiar words and customs, that like many other folk Hobbits had in the distant past moved westward. Their earliest tales seem to glimpse a time when they dwelt in the upper vales of Anduin, between the eaves of Greenwood the Great and the Misty Mountains. Why they later undertook the hard and perilous crossing of the mountains into Eriador is no longer certain. Their own accounts speak of the multiplying of Men in the land, and of a shadow that fell on the forest, so that it became darkened and its new name was Mirkwood.
Before the crossing of the mountains the Hobbits had already become divided into three somewhat different breeds: Harfoots, Stoors, and Fallohides. The Harfoots were browner of skin, smaller, and shorter, and they were beardless and bootless; their hands and feet were neat and nimble; and they preferred highlands and hillsides. The Stoors were broader, heavier in build; their feet and hands were larger, and they preferred flat lands and riversides. The Fallohides were fairer of skin and also of hair, and they were taller and slimmer than the others; they were lovers of trees and of woodlands.
The Harfoots had much to do with Dwarves in ancient times, and long lived in the foothills of the mountains. They moved westward early, and roamed over Eriador as far as Weathertop while the others were still in the Wilderland. They were the most normal and representative variety of Hobbit, and far the most numerous. They were the most inclined to settle in one place, and longest preserved their ancestral habit of living in tunnels and holes.
The Stoors lingered long by the banks of the Great River Anduin, and were less shy of Men. They came west after the Harfoots and followed the course of the Loudwater southwards; and there many of them long dwelt between Tharbad and the borders of Dunland before they moved north again.
The Fallohides, the least numerous, were a northerly branch. They were more friendly with Elves than the other Hobbits were, and had more skill in language and song than in handicrafts; and of old they preferred hunting to tilling. They crossed the mountains north of Rivendell and came down the River Hoarwell. In Eriador they soon mingled with the other kinds that had preceded them, but being somewhat bolder and more adventurous, they were often found as leaders or chieftains among clans of Harfoots or Stoors. Even in Bilbo's time the strong Fallohidish strain could still be noted among the greater families, such as the Tooks and the Masters of Buckland.
In the westlands of Eriador, between the Misty Mountains and the Mountains of Lune, the Hobbits found both Men and Elves. Indeed, a remnant still dwelt there of the Dunedain, the kings of Men that came over the Sea out of Westernesse; but they were dwindling fast and the lands of their North Kingdom were falling far and wide into waste. There was room and to spare for incomers, and ere long the Hobbits began to settle in ordered communities. Most of their earlier settlements had long disappeared and been forgotten in Bilbo's time; but one of the first to become important still endured, though reduced in size; this was at Bree and in the Chetwood that lay round about, some forty miles east of the Shire.
It was in these early days, doubtless, that the Hobbits learned their letters and began to write after the manner of the Dunedain, who had in their turn long before learned the art from the Elves. And in those days also they forgot whatever languages they had used before, and spoke ever after the Common Speech, the Westron as it was named, that was current through all the lands of the kings from Arnor to Gondor, and about all the coasts of the Sea from Belfalas to Lune. Yet they kept a few words of their own, as well as their own names of months and days, and a great store of personal names out of the past.
About this time legend among the Hobbits first becomes history with a reckoning of years. For it was in the one thousand six hundred and first year of the Third Age that the Fallohide brothers, Marcho and Blanco, set out from Bree; and having obtained permission from the high king at Fornost, they crossed the brown river Baranduin with a great following of Hobbits. They passed over the Bridge of Stonebows, that had been built in the days of the power of the North Kingdom, and they took all the land beyond to dwell in, between the river and the Far Downs. All that was demanded of them was that they should keep the Great Bridge in repair, and all other bridges and roads, speed the king's messengers, and acknowledge his lordship.
Thus began the Shire-reckoning, for the year of the crossing of the Brandywine (as the Hobbits turned the name) became Year One of the Shire, and all later dates were reckoned from it. At once the western Hobbits fell in love with their new land, and they remained there, and soon passed once more out of the history of Men and of Elves. While there was still a king they were in name his subjects, but they were, in fact, ruled by their own chieftains and meddled not at all with events in the world outside. To the last battle at Fornost with the Witch-lord of Angmar they sent some bowmen to the aid of the king, or so they maintained, though no tales of Men record it. But in that war the North Kingdom ended; and then the Hobbits took the land for their own, and they chose from their own chiefs a Thain to hold the authority of the king that was gone. There for a thousand years they were little troubled by wars, and they prospered and multiplied after the Dark Plague (S.R. 37) until the disaster of the Long Winter and the famine that followed it. Many thousands then perished, but the Days of Dearth (1158-60) were at the time of this tale long past and the Hobbits had again become accustomed to plenty. The land was rich and kindly, and though it had long been deserted when they entered it, it had before been well tilled, and there the king had once had many farms, cornlands, vineyards, and woods.
Forty leagues it stretched from the Far Downs to the Brandywine Bridge, and fifty from the northern moors to the marshes in the south. The Hobbits named it the Shire, as the region of the authority of their Thain, and a district of well-ordered business; and there in that pleasant corner of the world they plied their well-ordered business of living, and they heeded less and less the world outside where dark things moved, until they came to think that peace and plenty were the rule in Middle-earth and the right of all sensible folk. They forgot or ignored what little they had ever known of the Guardians, and of the labours of those that made possible the long peace of the Shire. They were, in fact, sheltered, but they had ceased to remember it.
At no time had Hobbits of any kind been warlike, and they had never fought among themselves. In olden days they had, of course, been often obliged to fight to maintain themselves in a hard world; but in Bilbo's time that was very ancient history. The last battle, before this story opens, and indeed the only one that had ever been fought within the borders of the Shire, was beyond living memory: the Battle of Greenfields, S.R. 1147, in which Bandobras Took routed an invasion of Orcs. Even the weathers had grown milder, and the wolves that had once come ravening out of the North in bitter white winters were now only a grandfather's tale. So, though there was still some store of weapons in the Shire, these were used mostly as trophies, hanging above hearths or on walls, or gathered into the museum at Michel Delving. The Mathom-house it was called; for anything that Hobbits had no immediate use for, but were unwilling to throw away, they called a mathom. Their dwellings were apt to become rather crowded with mathoms, and many of the presents that passed from hand to hand were of that sort.
Nonetheless, ease and peace had left this people still curiously tough. They were, if it came to it, difficult to daunt or to kill; and they were, perhaps, so unwearyingly fond of good things not least because they could, when put to it, do without them, and could survive rough handling by grief, foe, or weather in a way that astonished those who did not know them well and looked no further than their bellies and their well-fed faces. Though slow to quarrel, and for sport killing nothing that lived, they were doughty at bay, and at need could still handle arms. They shot well with the bow, for they were keen-eyed and sure at the mark. Not only with bows and arrows. If any Hobbit stooped for a stone, it was well to get quickly under cover, as all trespassing beasts knew very well.
All Hobbits had originally lived in holes in the ground, or so they believed, and in such dwellings they still felt most at home; but in the course of time they had been obliged to adopt other forms of abode. Actually in the Shire in Bilbo's days it was, as a rule, only the richest and the poorest Hobbits that maintained the old custom. The poorest went on living in burrows of the most primitive kind, mere holes indeed, with only one window or none; while the well-to-do still constructed more luxurious versions of the simple diggings of old. But suitable sites for these large and ramifying tunnels (or smials as they called them) were not everywhere to be found; and in the flats and the low-lying districts the Hobbits, as they multiplied, began to build above ground. Indeed, even in the hilly regions and the older villages, such as Hobbiton or Tuckborough, or in the chief township of the Shire, Michel Delving on the White Downs, there were now many houses of wood, brick, or stone. These were specially favoured by millers, smiths, ropers, and cartwrights, and others of that sort; for even when they had holes to live in, Hobbits had long been accustomed to build sheds and workshops.
The habit of building farmhouses and barns was said to have begun among the inhabitants of the Marish down by the Brandywine. The Hobbits of that quarter, the Eastfarthing, were rather large and heavy-legged, and they wore dwarf-boots in muddy weather. But they were well known to be Stoors in a large part of their blood, as indeed was shown by the down that many grew on their chins. No Harfoot or Fallohide had any trace of a beard. Indeed, the folk of the Marish, and of Buckland, east of the River, which they afterwards occupied, came for the most part later into the Shire up from south-away; and they still had many peculiar names and strange words not found elsewhere in the Shire.
It is probable that the craft of building, as many other crafts beside, was derived from the Dunedain. But the Hobbits may have learned it direct from the Elves, the teachers of Men in their youth. For the Elves of the High Kindred had not yet forsaken Middle-earth, and they dwelt still at that time at the Grey Havens away to the west, and in other places within reach of the Shire. Three Elf-towers of immemorial age were still to be seen on the Tower Hills beyond the western marches. They shone far off in the moonlight. The tallest was furthest away, standing alone upon a green mound. The Hobbits of the Westfarthing said that one could see the Sea from the top of that tower; but no Hobbit had ever been known to climb it. Indeed, few Hobbits had ever seen or sailed upon the Sea, and fewer still had ever returned to report it. Most Hobbits regarded even rivers and small boats with deep misgivings, and not many of them could swim. And as the days of the Shire lengthened they spoke less and less with the Elves, and grew afraid of them, and distrustful of those that had dealings with them; and the Sea became a word of fear among them, and a token of death, and they turned their faces away from the hills in the west.
The craft of building may have come from Elves or Men, but the Hobbits used it in their own fashion. They did not go in for towers. Their houses were usually long, low, and comfortable. The oldest kind were, indeed, no more than built imitations of smials, thatched with dry grass or straw, or roofed with turves, and having walls somewhat bulged. That stage, however, belonged to the early days of the Shire, and hobbit-building had long since been altered, improved by devices, learned from Dwarves, or discovered by themselves. A preference for round windows, and even round doors, was the chief remaining peculiarity of hobbit-architecture.
The houses and the holes of Shire-hobbits were often large, and inhabited by large families. (Bilbo and Frodo Baggins were as bachelors very exceptional, as they were also in many other ways, such as their friendship with the Elves.) Sometimes, as in the case of the Tooks of Great Smials, or the Brandybucks of Brandy Hall, many generations of relatives lived in (comparative) peace together in one ancestral and many-tunnelled mansion. All Hobbits were, in any case, clannish and reckoned up their relationships with great care. They drew long and elaborate family-trees with innumerable branches. In dealing with Hobbits it is important to remember who is related to whom, and in what degree. It would be impossible in this book to set out a family-tree that included even the more important members of the more important families at the time which these tales tell of. The genealogical trees at the end of the Red Book of Westmarch are a small book in themselves, and all but Hobbits would find them exceedingly dull. Hobbits delighted in such things, if they were accurate: they liked to have books filled with things that they already knew, set out fair and square with no contradictions.
There is another astonishing thing about Hobbits of old that must be mentioned, an astonishing habit: they imbibed or inhaled, through pipes of clay or wood, the smoke of the burning leaves of a herb, which they called pipe-weed or leaf, a variety probably of Nicotiana. A great deal of mystery surrounds the origin of this peculiar custom, or 'art' as the Hobbits preferred to call it. All that could be discovered about it in antiquity was put together by Meriadoc Brandybuck (later Master of Buckland), and since he and the tobacco of the Southfarthing play a part in the history that follows, his remarks in the introduction to his Herblore of the Shire may be quoted.
"This," he says, "is the one art that we can certainly claim to be our own invention. When Hobbits first began to smoke is not known, all the legends and family histories take it for granted; for ages folk in the Shire smoked various herbs, some fouler, some sweeter. But all accounts agree that Tobold Hornblower of Longbottom in the Southfarthing first grew the true pipe-weed in his gardens in the days of Isengrim the Second, about the year 1070 of Shire-reckoning. The best home-grown still comes from that district, especially the varieties now known as Longbottom Leaf, Old Toby, and Southern Star.
"How Old Toby came by the plant is not recorded, for to his dying day he would not tell. He knew much about herbs, but he was no traveller. It is said that in his youth he went often to Bree, though he certainly never went further from the Shire than that. It is thus quite possible that he learned of this plant in Bree, where now, at any rate, it grows well on the south slopes of the hill. The Bree-hobbits claim to have been the first actual smokers of the pipe-weed. They claim, of course, to have done everything before the people of the Shire, whom they refer to as "colonists"; but in this case their claim is, I think, likely to be true. And certainly it was from Bree that the art of smoking the genuine weed spread in the recent centuries among Dwarves and such other folk, Rangers, Wizards, or wanderers, as still passed to and fro through that ancient road-meeting. The home and centre of the art is thus to be found in the old inn of Bree, The Prancing Pony, that has been kept by the family of Butterbur from time beyond record.
"All the same, observations that I have made on my own many journeys south have convinced me that the weed itself is not native to our parts of the world, but came northward from the lower Anduin, whither it was, I suspect, originally brought over Sea by the Men of Westernesse. It grows abundantly in Gondor, and there is richer and larger than in the North, where it is never found wild, and flourishes only in warm sheltered places like Longbottom. The Men of Gondor call it sweet galenas, and esteem it only for the fragrance of its flowers. From that land it must have been carried up the Greenway during the long centuries between the coming of Elendil and our own day. But even the Dunedain of Gondor allow us this credit: Hobbits first put it into pipes. Not even the Wizards first thought of that before we did. Though one Wizard that I knew took up the art long ago, and became as skilful in it as in all other things that he put his mind to."
The Shire was divided into four quarters, the Farthings already referred to, North, South, East, and West; and these again each into a number of folklands, which still bore the names of some of the old leading families, although by the time of this history these names were no longer found only in their proper folklands. Nearly all Tooks still lived in the Tookland, but that was not true of many other families, such as the Bagginses or the Boffins. Outside the Farthings were the East and West Marches: the Buckland (see beginning of Chapter V, Book I); and the Westmarch added to the Shire in S.R. 1462.
The Shire at this time had hardly any 'government'. Families for the most part managed their own affairs. Growing food and eating it occupied most of their time. In other matters they were, as a rule, generous and not greedy, but contented and moderate, so that estates, farms, workshops, and small trades tended to remain unchanged for generations.
There remained, of course, the ancient tradition concerning the high king at Fornost, or Norbury as they called it, away north of the Shire. But there had been no king for nearly a thousand years, and even the ruins of Kings' Norbury were covered with grass. Yet the Hobbits still said of wild folk and wicked things (such as trolls) that they had not heard of the king. For they attributed to the king of old all their essential laws; and usually they kept the laws of free will, because they were The Rules (as they said), both ancient and just.
It is true that the Took family had long been pre-eminent; for the office of Thain had passed to them (from the Oldbucks) some centuries before, and the chief Took had borne that title ever since. The Thain was the master of the Shire-moot, and captain of the Shire-muster and the Hobbitry-in-arms, but as muster and moot were only held in times of emergency, which no longer occurred, the Thainship had ceased to be more than a nominal dignity. The Took family was still, indeed, accorded a special respect, for it remained both numerous and exceedingly wealthy, and was liable to produce in every generation strong characters of peculiar habits and even adventurous temperament. The latter qualities, however, were now rather tolerated (in the rich) than generally approved. The custom endured, nonetheless, of referring to the head of the family as The Took, and of adding to his name, if required, a number: such as Isengrim the Second, for instance.
The only real official in the Shire at this date was the Mayor of Michel Delving (or of the Shire), who was elected every seven years at the Free Fair on the White Downs at the Lithe, that is at Midsummer. As mayor almost his only duty was to preside at banquets, given on the Shire-holidays, which occurred at frequent intervals. But the offices of Postmaster and First Shirriff were attached to the mayoralty, so that he managed both the Messenger Service and the Watch. These were the only Shire-services, and the Messengers were the most numerous, and much the busier of the two. By no means all Hobbits were lettered, but those who were wrote constantly to all their friends (and a selection of their relations) who lived further off than an afternoon's walk.
The Shirriffs was the name that the Hobbits gave to their police, or the nearest equivalent that they possessed. They had, of course, no uniforms (such things being quite unknown), only a feather in their caps; and they were in practice rather haywards than policemen, more concerned with the strayings of beasts than of people. There were in all the Shire only twelve of them, three in each Farthing, for Inside Work. A rather larger body, varying at need, was employed to 'beat the bounds', and to see that Outsiders of any kind, great or small, did not make themselves a nuisance.
At the time when this story begins the Bounders, as they were called, had been greatly increased. There were many reports and complaints of strange persons and creatures prowling about the borders, or over them: the first sign that all was not quite as it should be, and always had been except in tales and legends of long ago. Few heeded the sign, and not even Bilbo yet had any notion of what it portended. Sixty years had passed since he set out on his memorable journey, and he was old even for Hobbits, who reached a hundred as often as not; but much evidently still remained of the considerable wealth that he had brought back. How much or how little he revealed to no one, not even to Frodo his favourite 'nephew'. And he still kept secret the ring that he had found.
As is told in The Hobbit, there came one day to Bilbo's door the great Wizard, Gandalf the Grey, and thirteen dwarves with him: none other, indeed, than Thorin Oakenshield, descendant of kings, and his twelve companions in exile. With them he set out, to his own lasting astonishment, on a morning of April, it being then the year 1341 Shire-reckoning, on a quest of great treasure, the dwarf-hoards of the Kings under the Mountain, beneath Erebor in Dale, far off in the East. The quest was successful, and the Dragon that guarded the hoard was destroyed. Yet, though before all was won the Battle of Five Armies was fought, and Thorin was slain, and many deeds of renown were done, the matter would scarcely have concerned later history, or earned more than a note in the long annals of the Third Age, but for an 'accident' by the way. The party was assailed by Orcs in a high pass of the Misty Mountains as they went towards Wilderland; and so it happened that Bilbo was lost for a while in the black orc-mines deep under the mountains, and there, as he groped in vain in the dark, he put his hand on a ring, lying on the floor of a tunnel. He put it in his pocket. It seemed then like mere luck.
Trying to find his way out, Bilbo went on down to the roots of the mountains, until he could go no further. At the bottom of the tunnel lay a cold lake far from the light, and on an island of rock in the water lived Gollum. He was a loathsome little creature: he paddled a small boat with his large flat feet, peering with pale luminous eyes and catching blind fish with his long fingers, and eating them raw. He ate any living thing, even orc, if he could catch it and strangle it without a struggle. He possessed a secret treasure that had come to him long ages ago, when he still lived in the light: a ring of gold that made its wearer invisible. It was the one thing he loved, his 'precious', and he talked to it, even when it was not with him. For he kept it hidden safe in a hole on his island, except when he was hunting or spying on the orcs of the mines.
Maybe he would have attacked Bilbo at once, if the ring had been on him when they met; but it was not, and the hobbit held in his hand an Elvish knife, which served him as a sword. So to gain time Gollum challenged Bilbo to the Riddle-game, saying that if he asked a riddle which Bilbo could not guess, then he would kill him and eat him; but if Bilbo defeated him, then he would do as Bilbo wished: he would lead him to a way out of the tunnels.
|
||||
|
||||
Since he was lost in the dark without hope, and could neither go on nor back. Bilbo accepted the challenge; and they asked one another many riddles. In the end Bilbo won the game, more by luck (as it seemed) than by wits; for he was stumped at last for a riddle to ask, and cried out, as his hand came upon the ring he lad picked up and forgotten:What haw I got in my pocket? This Gollum failed to answer, though he demanded three guesses.
|
||||
|
||||
The Authorities, it is true, differ whether this last question was a mere 'question' and not a 'riddle' according to the strict rules of the Game; but all agree that, after accepting it and trying to guess the answer, Gollum was bound by his promise. And Bilbo pressed him to keep his word; for the thought came to him that this slimy creature might prove false, even though such promises were held sacred, and of old all but the wickedest things feared to break them. But after ages alone in the dark Gollum's heart was black, and treachery was in it. He slipped away, and returned to the island, of which Bilbo knew nothing, not far off in the dark water. There, he thought, lay his ring. He was hungry now, and angry, and once his 'precious' was with him he would not fear any weapon at all.
|
||||
|
||||
But the ring was not on the island; he had lost it, it was gone. His screech sent a shiver down Bilbo's back, though he did not yet understand what had happened. But Gollum had at last leaped to a guess, too late.What has it got in its pocketses? he cried. The light in his eyes was like a green flame as he sped back to murder the hobbit and recover his 'precious'. Just in time Bilbo saw his peril, and he fled blindly up the passage away from the water; and once more he was saved by his luck. For just as he ran he put his hand in his pocket, and the ring slipped quietly on to his finger. So it was that Gollum passed him without seeing him, and went to guard the way out, lest the 'thief' should escape. Warily Bilbo followed him, as he went along, cursing, and talking to himself about his 'precious'; from which talk at last even Bilbo guessed the truth, and hope came to him in the darkness: he himself had found the marvellous ring and a chance of escape from the orcs and from Gollum.
|
||||
|
||||
At length they came to a halt before an unseen opening that led to the lower gates of the mines, on the eastward side of the mountains. There Gollum crouched at bay, smelling and listening; and Bilbo was tempted to slay him with his sword. But pity stayed him, and though he kept the ring, in which his only hope lay, he would not use it to help him kill the wretched creature at a disadvantage. In the end, gathering his courage, he leaped over Gollum in the dark, and fled away down the passage, pursued by his enemy's cries of hate and despair:Thief, thief! Baggins! We hates it for ever!
|
||||
|
||||
Now it is a curious fact that this is not the story as Bilbo first told it to his companions. To them his account was that Gollum had promised to give him apresent, if he won the game; but when Gollum went to fetch it from his island he found the treasure was gone: a magic ring, which had been given to him long ago on his birthday. Bilbo guessed that this was the very ring that he had found, and as he had won the game, it was already his by right. But being in a tight place, he said nothing about it, and made Gollum show him the way out, as a reward instead of a present. This account Bilbo set down in his memoirs, and he seems never to have altered it himself, not even after the Council of Elrond. Evidently it still appeared in the original Red Book, as it did in several of the copies and abstracts. But many copies contain the true account (as an alternative), derived no doubt from notes by Frodo or Samwise, both of whom learned the truth, though they seem to have been unwilling to delete anything actually written by the old hobbit himself.
|
||||
|
||||
Gandalf, however, disbelieved Bilbo's first story, as soon as he heard it, and he continued to be very curious about the ring. Eventually he got the true tale out of Bilbo after much questioning, which for a while strained their friendship; but the wizard seemed to think the truth important. Though he did not say so to Bilbo, he also thought it important, and disturbing, to find that the good hobbit had not told the truth from the first: quite contrary to his habit. The idea of a 'present' was not mere hobbitlike invention, all the same. It was suggested to Bilbo, as he confessed, by Gollum's talk that he overheard; for Gollum did, in fact, call the ring his 'birthday present', many times. That also Gandalf thought strange and suspicious; but he did not discover the truth in this point for many more years, as will be seen in this book.
|
||||
|
||||
Of Bilbo's later adventures little more need be said here. With the help of the ring he escaped from the orc-guards at the gate and rejoined his companions. He used the ring many times on his quest, chiefly for the help of his friends; but he kept it secret from them as long as he could. After his return to his home he never spoke of it again to anyone, save Gandalf and Frodo; and no one else in the Shire knew of its existence, or so he believed. Only to Frodo did he show the account of his Journey that he was writing.
|
||||
|
||||
His sword, Sting, Bilbo hung over his fireplace, and his coat of marvellous mail, the gift of the Dwarves from the Dragon-hoard, he lent to a museum, to the Michel Delving Mathom-house in fact. But he kept in a drawer at Bag End the old cloak and hood that he had worn on his travels; and the ring, secured by a fine chain, remained in his pocket.
|
||||
|
||||
He returned to his home at Bag End on June the 22nd in his fifty-second year (S.R. 1342), and nothing very notable occurred in the Shire until Mr. Baggins began the preparations for the celebration of his hundred-and-eleventh birthday (S.R. 1401). At this point this History begins.
|
||||
|
||||
At the end of the Third Age the part played by the Hobbits in the great events that led to the inclusion of the Shire in the Reunited Kingdom awakened among them a more widespread interest in their own history; and many of their traditions, up to that time still mainly oral, were collected and Written down. The greater families were also concerned with events in the Kingdom at large, and many of their members studied its ancient histories and legends. By the end of the first century of the Fourth Age there were already to be found in the Shire several libraries that contained many historical books and records.
|
||||
|
||||
The largest of these collections were probably at Undertowers, at Great Smials, and at Brandy Hall. This account of the end of the Third Age is drawn mainly from the Red Book of Westmarch. That most important source for the history of the War of the Ring was so called because it was long preserved at Undertowers, the home of the Fairbairns, Wardens of the Westmarch. It was in origin Bilbo's private diary, which he took with him to Rivendell. Frodo brought it back to the Shire, together with many loose leaves of notes, and during S.R. 1420-1 he nearly filled its pages with his account of the War. But annexed to it and preserved with it, probably m a single red case, were the three large volumes, bound in red leather, that Bilbo gave to him as a parting gift. To these four volumes there was added in Westmarch a fifth containing commentaries, genealogies, and various other matter concerning the hobbit members of the Fellowship.
|
||||
|
||||
The original Red Book has not been preserved, but many copies were made, especially of the first volume, for the use of the descendants of the children of Master Samwise. The most important copy, however, has a different history. It was kept at Great Smials, but it was written in Condor, probably at the request of the great-grandson of Peregrin, and completed in S.R. 1592 (F.A. 172). Its southern scribe appended this note: Findegil, King's Writer, finished this work in IV 172. It is an exact copy in all details of the Thain's Book m Minas Tirith. That book was a copy, made at the request of King Elessar, of the Red Book of the Periannath, and was brought to him by the Thain Peregrin when he retired to Gondor in IV 64.
|
||||
|
||||
The Thain's Book was thus the first copy made of the Red Book and contained much that was later omitted or lost. In Minas Tirith it received much annotation, and many corrections, especially of names, words, and quotations in the Elvish languages; and there was added to it an abbreviated version of those parts ofThe Tale of Aragorn and Arwen which lie outside the account of the War. The full tale is stated to have been written by Barahir, grandson of the Steward Faramir, some time after the passing of the King. But the chief importance of Findegil's copy is that it alone contains the whole of Bilbo's "Translations from the Elvish'. These three volumes were found to be a work of great skill and learning in which, between 1403 and 1418, he had used all the sources available to him in Rivendell, both living and written. But since they were little used by Frodo, being almost entirely concerned with the Elder Days, no more is said of them here.
|
||||
|
||||
Since Meriadoc and Peregrin became the heads of their great families, and at the same time kept up their connexions with Rohan and Gondor, the libraries at Bucklebury and Tuckborough contained much that did not appear in the Red Book. In Brandy Hall there were many works dealing with Eriador and the history of Rohan. Some of these were composed or begun by Meriadoc himself, though in the Shire he was chiefly remembered for hisHerblore of the Shire, and for hisReckoning of Years m which he discussed the relation of the calendars of the Shire and Bree to those of Rivendell, Gondor, and Rohan. He also wrote a short treatise onOld Words and Names in the Shire, having special interest in discovering the kinship with the language of the Rohirrim of such 'shire-words' asmathom and old elements in place names.
|
||||
|
||||
At Great Smials the books were of less interest to Shire-folk, though more important for larger history. None of them was written by Peregrin, but he and his successors collected many manuscripts written by scribes of Gondor: mainly copies or summaries of histories or legends relating to Elendil and his heirs. Only here in the Shire were to be found extensive materials for the history of Numenor and the arising of Sauron. It was probably at Great Smials thatThe Tale of Years was put together, with the assistance of material collected by Meriadoc. Though the dates given are often conjectural, especially for the Second Age, they deserve attention. It is probable that Meriadoc obtained assistance and information from Rivendell, which he visited more than once. There, though Elrond had departed, his sons long remained, together with some of the High-elven folk. It is said that Celeborn went to dwell there after the departure of Galadriel; but there is no record of the day when at last he sought the Grey Havens, and with him went the last living memory of the Elder Days in Middle-earth.
|
||||
21
basharin_sevastyan_lab_7/README.md
Normal file
@@ -0,0 +1,21 @@
## Laboratory work 7. Variant 5.

### Task

Choose a literary text (even variants: Russian-language, odd variants: English-language) and train a recurrent neural network on it for the task of text generation. Tune the architecture and parameters so as to get as close as possible to a meaningful result.

Finally, find a compromise architecture that copes reasonably well with both kinds of text.

### Course of work

The prologue of The Lord of the Rings was taken for the English model. Although this model turned out better than the Russian one, training took a little over an hour.

#### Result (rus)

здесь был человек прежде всего всего обманывает самого себя ибо он думает что успешно соврал а люди поняли и из
деликатности промолчали промолчали промолчали промолчали промолчали какие его неудачи могут его постигнуть не тому
помочь много ли людей не нуждаются в помощи помощи было врать врать врать молчания молчания а внести то

#### Result (eng)

the harfoots were browner of skin smaller and shorter and they were beardless and bootless their hands and feet were
neat and nimble and they preferred highlands and hillsides the stoors were broader heavier in build their feet and
hands were larger and they preferred flat lands and riversides
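The looping words in the Russian sample ("промолчали промолчали ...") are typical of greedy argmax decoding: once one continuation dominates, it gets chosen again and again. A minimal sketch of temperature sampling, which usually breaks such loops; the `temperature` value and the `sample_index` helper are illustrative additions, not part of the lab code:

```python
import numpy as np

def sample_index(predicted_probs, temperature=0.8):
    # Rescale the distribution: temperature < 1 sharpens it, > 1 flattens it.
    logits = np.log(predicted_probs + 1e-9) / temperature
    probs = np.exp(logits - np.max(logits))
    probs /= probs.sum()
    # Draw from the rescaled distribution instead of taking the argmax.
    return np.random.choice(len(probs), p=probs)
```

Swapping `np.argmax(predicted_probs)` in `generate_text` (in `main.py` below) for `sample_index(predicted_probs)` leaves the rest of the script unchanged.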
70
basharin_sevastyan_lab_7/main.py
Normal file
@@ -0,0 +1,70 @@
import numpy as np
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense
from keras.utils import to_categorical

with open('ru.txt', "r", encoding='utf-8') as file:
    text = file.read()

# Text preprocessing (depends on your task)

# Build the vocabulary that maps words to indices and back
tokenizer = Tokenizer()
tokenizer.fit_on_texts([text])
total_words = len(tokenizer.word_index) + 1

# Prepare the training data (depends on your task):
# every line contributes all of its n-gram prefixes, e.g. [w1 w2], [w1 w2 w3], ...
input_sequences = []
for line in text.split('\n'):
    token_list = tokenizer.texts_to_sequences([line])[0]
    for i in range(1, len(token_list)):
        n_gram_sequence = token_list[:i+1]
        input_sequences.append(n_gram_sequence)

max_sequence_length = max([len(x) for x in input_sequences])
input_sequences = pad_sequences(input_sequences, maxlen=max_sequence_length, padding='pre')
# The last token of each padded sequence is the prediction target
X, y = input_sequences[:, :-1], input_sequences[:, -1]
y = to_categorical(y, num_classes=total_words)

# Define the model architecture
model = Sequential()
model.add(Embedding(total_words, 50, input_length=max_sequence_length-1))
model.add(LSTM(100))
model.add(Dense(total_words, activation='softmax'))

# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X, y, epochs=100, verbose=2)


# Generate text with the trained model
def generate_text(seed_text, next_words, model_, max_sequence_length):
    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_sequence_length - 1, padding='pre')
        # Predict the next-word distribution with the model passed in as a parameter
        predicted_probs = model_.predict(token_list, verbose=0)[0]
        predicted_index = np.argmax(predicted_probs)
        output_word = ""
        for word, index in tokenizer.word_index.items():
            if index == predicted_index:
                output_word = word
                break
        seed_text += " " + output_word

    return seed_text


# Example of text generation (replace seed_text and next_words with your own values)
seed_text = "здесь был"
next_words = 50
generated_text = generate_text(seed_text, next_words, model, max_sequence_length)

print(generated_text)
9
basharin_sevastyan_lab_7/ru.txt
Normal file
@@ -0,0 +1,9 @@
Когда человек сознательно или интуитивно выбирает себе в жизни какую-то цель, жизненную задачу, он невольно дает себе оценку. По тому, ради чего человек живет, можно судить и о его самооценке - низкой или высокой.
Если человек живет, чтобы приносить людям добро, облегчать их страдания, давать людям радость, то он оценивает себя на уровне этой своей человечности. Он ставит себе цель, достойную человека.
Только такая цель позволяет человеку прожить свою жизнь с достоинством и получить настоящую радость. Да, радость! Подумайте: если человек ставит себе задачей увеличивать в жизни добро, приносить людям счастье, какие неудачи могут его постигнуть? Не тому помочь? Но много ли людей не нуждаются в помощи?
Если жить только для себя, своими мелкими заботами о собственном благополучии, то от прожитого не останется и следа. Если же жить для других, то другие сберегут то, чему служил, чему отдавал силы.
Можно по-разному определять цель своего существования, но цель должна быть. Надо иметь и принципы в жизни. Одно правило в жизни должно быть у каждого человека, в его цели жизни, в его принципах жизни, в его поведении: надо прожить жизнь с достоинством, чтобы не стыдно было вспоминать.
Достоинство требует доброты, великодушия, умения не быть эгоистом, быть правдивым, хорошим другом, находить радость в помощи другим.
Ради достоинства жизни надо уметь отказываться от мелких удовольствий и немалых тоже… Уметь извиняться, признавать перед другими ошибку - лучше, чем врать.
Обманывая, человек прежде всего обманывает самого себя, ибо он думает, что успешно соврал, а люди поняли и из деликатности промолчали.
Жизнь - прежде всего творчество, но это не значит, что каждый человек, чтобы жить, должен родиться художником, балериной или ученым. Можно творить просто добрую атмосферу вокруг себя. Человек может принести с собой атмосферу подозрительности, какого-то тягостного молчания, а может внести сразу радость, свет. Вот это и есть творчество.
60
degtyarev_mikhail_lab_6/Readme.md
Normal file
@@ -0,0 +1,60 @@
# Laboratory work 6

## Variant 9

## Task

Use the MLPClassifier neural network on the data from Table 1 for your variant, formulating the task yourself. Interpret the results and evaluate how well the network suits the task you formulated.

Task statement:

Use MLPClassifier to predict salary based on experience level (experience_level), employment type (employment_type), company location (company_location) and company size (company_size). Evaluate how well the neural network suits this task.

## Program description

The program is an example of using MLPClassifier to predict salary from several features.

### Libraries used

- `pandas`: a library for data processing and analysis, used here to load and preprocess the data.
- `scikit-learn`:
  - `train_test_split`: used to split the data into training and test sets.
  - `StandardScaler`: applied to normalize the numeric features.
  - `OneHotEncoder`: used to encode the categorical features.
  - `MLPClassifier`: a multilayer perceptron classifier (neural network).
  - `accuracy_score`: used to evaluate classification accuracy.

### Program steps

1. **Loading the data:**
   - The data is loaded from the `ds_salaries.csv` file using pandas.

2. **Defining salary categories:**
   - Salary categories are created from bins using `pd.cut`.

3. **Adding the category column:**
   - The category column is added to the data.

4. **Data preprocessing:**
   - The categorical features ('experience_level', 'employment_type', 'job_title', 'employee_residence', 'company_location', 'company_size') are encoded with OneHotEncoder.
   - The numeric features ('work_year', 'remote_ratio') are normalized with StandardScaler.
   - These steps are combined in a ColumnTransformer and used as the data preprocessor.

5. **Feature selection:**
   - The features to be used for training the model are defined.

6. **Splitting the data:**
   - The data is split into training and test sets in an 80/20 ratio using `train_test_split`.

7. **Training the model:**
   - An MLPClassifier is combined with the preprocessor inside a Pipeline.

8. **Evaluating model performance:**
   - The model's accuracy is computed and printed using the `accuracy_score` metric.
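The lab's source file itself is not included in this diff, so here is a minimal sketch of the pipeline the steps above describe. It assumes `ds_salaries.csv` is in the working directory; the salary bin edges, hidden layer size and `random_state` are illustrative choices, not taken from the lab:

```python
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

df = pd.read_csv('ds_salaries.csv')

# Bin the USD salary into categories; the bin edges here are assumptions.
df['salary_category'] = pd.cut(df['salary_in_usd'],
                               bins=[0, 50_000, 100_000, 150_000, float('inf')],
                               labels=['low', 'medium', 'high', 'very_high'])

categorical = ['experience_level', 'employment_type', 'job_title',
               'employee_residence', 'company_location', 'company_size']
numeric = ['work_year', 'remote_ratio']

# One-hot encode the categorical features, scale the numeric ones.
preprocessor = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical),
    ('num', StandardScaler(), numeric),
])

X_train, X_test, y_train, y_test = train_test_split(
    df[categorical + numeric], df['salary_category'],
    test_size=0.2, random_state=42)

# MLPClassifier combined with the preprocessor in a single Pipeline.
clf = Pipeline([
    ('prep', preprocessor),
    ('mlp', MLPClassifier(hidden_layer_sizes=(100,), max_iter=500, random_state=42)),
])
clf.fit(X_train, y_train)
print('Accuracy:', accuracy_score(y_test, clf.predict(X_test)))
```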
### Running the program

- Clone or download the `main.py` code.
- Run the file in an environment that can execute Python: `python main.py`

### Results

- The model's accuracy is measured with the accuracy metric, which can be printed to the console or used for visualization.

In this case the accuracy came out to 0.5901639344262295.

The closer the result is to one, the better; this result of 59% can be considered middling.
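Whether 59% is actually middling depends on the class balance: with four salary bins, a trivial classifier that always predicts the most frequent category sets the floor. A quick hedged check, reusing `y_test` from the sketch above:

```python
# Accuracy of always predicting the most frequent salary category.
baseline = y_test.value_counts(normalize=True).max()
print('Majority-class baseline accuracy:', baseline)
```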
608
degtyarev_mikhail_lab_6/ds_salaries.csv
Normal file
@@ -0,0 +1,608 @@
,work_year,experience_level,employment_type,job_title,salary,salary_currency,salary_in_usd,employee_residence,remote_ratio,company_location,company_size
0,2020,MI,FT,Data Scientist,70000,EUR,79833,DE,0,DE,L
1,2020,SE,FT,Machine Learning Scientist,260000,USD,260000,JP,0,JP,S
2,2020,SE,FT,Big Data Engineer,85000,GBP,109024,GB,50,GB,M
3,2020,MI,FT,Product Data Analyst,20000,USD,20000,HN,0,HN,S
4,2020,SE,FT,Machine Learning Engineer,150000,USD,150000,US,50,US,L
5,2020,EN,FT,Data Analyst,72000,USD,72000,US,100,US,L
6,2020,SE,FT,Lead Data Scientist,190000,USD,190000,US,100,US,S
7,2020,MI,FT,Data Scientist,11000000,HUF,35735,HU,50,HU,L
8,2020,MI,FT,Business Data Analyst,135000,USD,135000,US,100,US,L
9,2020,SE,FT,Lead Data Engineer,125000,USD,125000,NZ,50,NZ,S
10,2020,EN,FT,Data Scientist,45000,EUR,51321,FR,0,FR,S
11,2020,MI,FT,Data Scientist,3000000,INR,40481,IN,0,IN,L
12,2020,EN,FT,Data Scientist,35000,EUR,39916,FR,0,FR,M
13,2020,MI,FT,Lead Data Analyst,87000,USD,87000,US,100,US,L
14,2020,MI,FT,Data Analyst,85000,USD,85000,US,100,US,L
15,2020,MI,FT,Data Analyst,8000,USD,8000,PK,50,PK,L
16,2020,EN,FT,Data Engineer,4450000,JPY,41689,JP,100,JP,S
17,2020,SE,FT,Big Data Engineer,100000,EUR,114047,PL,100,GB,S
18,2020,EN,FT,Data Science Consultant,423000,INR,5707,IN,50,IN,M
19,2020,MI,FT,Lead Data Engineer,56000,USD,56000,PT,100,US,M
20,2020,MI,FT,Machine Learning Engineer,299000,CNY,43331,CN,0,CN,M
21,2020,MI,FT,Product Data Analyst,450000,INR,6072,IN,100,IN,L
22,2020,SE,FT,Data Engineer,42000,EUR,47899,GR,50,GR,L
23,2020,MI,FT,BI Data Analyst,98000,USD,98000,US,0,US,M
24,2020,MI,FT,Lead Data Scientist,115000,USD,115000,AE,0,AE,L
25,2020,EX,FT,Director of Data Science,325000,USD,325000,US,100,US,L
26,2020,EN,FT,Research Scientist,42000,USD,42000,NL,50,NL,L
27,2020,SE,FT,Data Engineer,720000,MXN,33511,MX,0,MX,S
28,2020,EN,CT,Business Data Analyst,100000,USD,100000,US,100,US,L
29,2020,SE,FT,Machine Learning Manager,157000,CAD,117104,CA,50,CA,L
30,2020,MI,FT,Data Engineering Manager,51999,EUR,59303,DE,100,DE,S
31,2020,EN,FT,Big Data Engineer,70000,USD,70000,US,100,US,L
32,2020,SE,FT,Data Scientist,60000,EUR,68428,GR,100,US,L
33,2020,MI,FT,Research Scientist,450000,USD,450000,US,0,US,M
34,2020,MI,FT,Data Analyst,41000,EUR,46759,FR,50,FR,L
35,2020,MI,FT,Data Engineer,65000,EUR,74130,AT,50,AT,L
36,2020,MI,FT,Data Science Consultant,103000,USD,103000,US,100,US,L
37,2020,EN,FT,Machine Learning Engineer,250000,USD,250000,US,50,US,L
38,2020,EN,FT,Data Analyst,10000,USD,10000,NG,100,NG,S
39,2020,EN,FT,Machine Learning Engineer,138000,USD,138000,US,100,US,S
40,2020,MI,FT,Data Scientist,45760,USD,45760,PH,100,US,S
41,2020,EX,FT,Data Engineering Manager,70000,EUR,79833,ES,50,ES,L
42,2020,MI,FT,Machine Learning Infrastructure Engineer,44000,EUR,50180,PT,0,PT,M
43,2020,MI,FT,Data Engineer,106000,USD,106000,US,100,US,L
44,2020,MI,FT,Data Engineer,88000,GBP,112872,GB,50,GB,L
45,2020,EN,PT,ML Engineer,14000,EUR,15966,DE,100,DE,S
46,2020,MI,FT,Data Scientist,60000,GBP,76958,GB,100,GB,S
47,2020,SE,FT,Data Engineer,188000,USD,188000,US,100,US,L
48,2020,MI,FT,Data Scientist,105000,USD,105000,US,100,US,L
49,2020,MI,FT,Data Engineer,61500,EUR,70139,FR,50,FR,L
50,2020,EN,FT,Data Analyst,450000,INR,6072,IN,0,IN,S
51,2020,EN,FT,Data Analyst,91000,USD,91000,US,100,US,L
52,2020,EN,FT,AI Scientist,300000,DKK,45896,DK,50,DK,S
53,2020,EN,FT,Data Engineer,48000,EUR,54742,PK,100,DE,L
54,2020,SE,FL,Computer Vision Engineer,60000,USD,60000,RU,100,US,S
55,2020,SE,FT,Principal Data Scientist,130000,EUR,148261,DE,100,DE,M
56,2020,MI,FT,Data Scientist,34000,EUR,38776,ES,100,ES,M
57,2020,MI,FT,Data Scientist,118000,USD,118000,US,100,US,M
58,2020,SE,FT,Data Scientist,120000,USD,120000,US,50,US,L
59,2020,MI,FT,Data Scientist,138350,USD,138350,US,100,US,M
60,2020,MI,FT,Data Engineer,110000,USD,110000,US,100,US,L
61,2020,MI,FT,Data Engineer,130800,USD,130800,ES,100,US,M
62,2020,EN,PT,Data Scientist,19000,EUR,21669,IT,50,IT,S
63,2020,SE,FT,Data Scientist,412000,USD,412000,US,100,US,L
64,2020,SE,FT,Machine Learning Engineer,40000,EUR,45618,HR,100,HR,S
65,2020,EN,FT,Data Scientist,55000,EUR,62726,DE,50,DE,S
66,2020,EN,FT,Data Scientist,43200,EUR,49268,DE,0,DE,S
67,2020,SE,FT,Data Science Manager,190200,USD,190200,US,100,US,M
68,2020,EN,FT,Data Scientist,105000,USD,105000,US,100,US,S
69,2020,SE,FT,Data Scientist,80000,EUR,91237,AT,0,AT,S
70,2020,MI,FT,Data Scientist,55000,EUR,62726,FR,50,LU,S
71,2020,MI,FT,Data Scientist,37000,EUR,42197,FR,50,FR,S
72,2021,EN,FT,Research Scientist,60000,GBP,82528,GB,50,GB,L
73,2021,EX,FT,BI Data Analyst,150000,USD,150000,IN,100,US,L
74,2021,EX,FT,Head of Data,235000,USD,235000,US,100,US,L
75,2021,SE,FT,Data Scientist,45000,EUR,53192,FR,50,FR,L
76,2021,MI,FT,BI Data Analyst,100000,USD,100000,US,100,US,M
77,2021,MI,PT,3D Computer Vision Researcher,400000,INR,5409,IN,50,IN,M
78,2021,MI,CT,ML Engineer,270000,USD,270000,US,100,US,L
79,2021,EN,FT,Data Analyst,80000,USD,80000,US,100,US,M
80,2021,SE,FT,Data Analytics Engineer,67000,EUR,79197,DE,100,DE,L
81,2021,MI,FT,Data Engineer,140000,USD,140000,US,100,US,L
82,2021,MI,FT,Applied Data Scientist,68000,CAD,54238,GB,50,CA,L
83,2021,MI,FT,Machine Learning Engineer,40000,EUR,47282,ES,100,ES,S
84,2021,EX,FT,Director of Data Science,130000,EUR,153667,IT,100,PL,L
85,2021,MI,FT,Data Engineer,110000,PLN,28476,PL,100,PL,L
86,2021,EN,FT,Data Analyst,50000,EUR,59102,FR,50,FR,M
87,2021,MI,FT,Data Analytics Engineer,110000,USD,110000,US,100,US,L
88,2021,SE,FT,Lead Data Analyst,170000,USD,170000,US,100,US,L
89,2021,SE,FT,Data Analyst,80000,USD,80000,BG,100,US,S
90,2021,SE,FT,Marketing Data Analyst,75000,EUR,88654,GR,100,DK,L
91,2021,EN,FT,Data Science Consultant,65000,EUR,76833,DE,100,DE,S
92,2021,MI,FT,Lead Data Analyst,1450000,INR,19609,IN,100,IN,L
93,2021,SE,FT,Lead Data Engineer,276000,USD,276000,US,0,US,L
94,2021,EN,FT,Data Scientist,2200000,INR,29751,IN,50,IN,L
95,2021,MI,FT,Cloud Data Engineer,120000,SGD,89294,SG,50,SG,L
96,2021,EN,PT,AI Scientist,12000,USD,12000,BR,100,US,S
97,2021,MI,FT,Financial Data Analyst,450000,USD,450000,US,100,US,L
98,2021,EN,FT,Computer Vision Software Engineer,70000,USD,70000,US,100,US,M
99,2021,MI,FT,Computer Vision Software Engineer,81000,EUR,95746,DE,100,US,S
100,2021,MI,FT,Data Analyst,75000,USD,75000,US,0,US,L
101,2021,SE,FT,Data Engineer,150000,USD,150000,US,100,US,L
102,2021,MI,FT,BI Data Analyst,11000000,HUF,36259,HU,50,US,L
103,2021,MI,FT,Data Analyst,62000,USD,62000,US,0,US,L
104,2021,MI,FT,Data Scientist,73000,USD,73000,US,0,US,L
105,2021,MI,FT,Data Analyst,37456,GBP,51519,GB,50,GB,L
106,2021,MI,FT,Research Scientist,235000,CAD,187442,CA,100,CA,L
107,2021,SE,FT,Data Engineer,115000,USD,115000,US,100,US,S
108,2021,SE,FT,Data Engineer,150000,USD,150000,US,100,US,M
109,2021,EN,FT,Data Engineer,2250000,INR,30428,IN,100,IN,L
110,2021,SE,FT,Machine Learning Engineer,80000,EUR,94564,DE,50,DE,L
111,2021,SE,FT,Director of Data Engineering,82500,GBP,113476,GB,100,GB,M
112,2021,SE,FT,Lead Data Engineer,75000,GBP,103160,GB,100,GB,S
113,2021,EN,PT,AI Scientist,12000,USD,12000,PK,100,US,M
114,2021,MI,FT,Data Engineer,38400,EUR,45391,NL,100,NL,L
115,2021,EN,FT,Machine Learning Scientist,225000,USD,225000,US,100,US,L
116,2021,MI,FT,Data Scientist,50000,USD,50000,NG,100,NG,L
117,2021,MI,FT,Data Science Engineer,34000,EUR,40189,GR,100,GR,M
118,2021,EN,FT,Data Analyst,90000,USD,90000,US,100,US,S
119,2021,MI,FT,Data Engineer,200000,USD,200000,US,100,US,L
120,2021,MI,FT,Big Data Engineer,60000,USD,60000,ES,50,RO,M
121,2021,SE,FT,Principal Data Engineer,200000,USD,200000,US,100,US,M
122,2021,EN,FT,Data Analyst,50000,USD,50000,US,100,US,M
123,2021,EN,FT,Applied Data Scientist,80000,GBP,110037,GB,0,GB,L
124,2021,EN,PT,Data Analyst,8760,EUR,10354,ES,50,ES,M
125,2021,MI,FT,Principal Data Scientist,151000,USD,151000,US,100,US,L
126,2021,SE,FT,Machine Learning Scientist,120000,USD,120000,US,50,US,S
127,2021,MI,FT,Data Scientist,700000,INR,9466,IN,0,IN,S
128,2021,EN,FT,Machine Learning Engineer,20000,USD,20000,IN,100,IN,S
129,2021,SE,FT,Lead Data Scientist,3000000,INR,40570,IN,50,IN,L
130,2021,EN,FT,Machine Learning Developer,100000,USD,100000,IQ,50,IQ,S
131,2021,EN,FT,Data Scientist,42000,EUR,49646,FR,50,FR,M
132,2021,MI,FT,Applied Machine Learning Scientist,38400,USD,38400,VN,100,US,M
133,2021,SE,FT,Computer Vision Engineer,24000,USD,24000,BR,100,BR,M
134,2021,EN,FT,Data Scientist,100000,USD,100000,US,0,US,S
135,2021,MI,FT,Data Analyst,90000,USD,90000,US,100,US,M
136,2021,MI,FT,ML Engineer,7000000,JPY,63711,JP,50,JP,S
137,2021,MI,FT,ML Engineer,8500000,JPY,77364,JP,50,JP,S
138,2021,SE,FT,Principal Data Scientist,220000,USD,220000,US,0,US,L
139,2021,EN,FT,Data Scientist,80000,USD,80000,US,100,US,M
140,2021,MI,FT,Data Analyst,135000,USD,135000,US,100,US,L
141,2021,SE,FT,Data Science Manager,240000,USD,240000,US,0,US,L
142,2021,SE,FT,Data Engineering Manager,150000,USD,150000,US,0,US,L
143,2021,MI,FT,Data Scientist,82500,USD,82500,US,100,US,S
144,2021,MI,FT,Data Engineer,100000,USD,100000,US,100,US,L
145,2021,SE,FT,Machine Learning Engineer,70000,EUR,82744,BE,50,BE,M
146,2021,MI,FT,Research Scientist,53000,EUR,62649,FR,50,FR,M
147,2021,MI,FT,Data Engineer,90000,USD,90000,US,100,US,L
148,2021,SE,FT,Data Engineering Manager,153000,USD,153000,US,100,US,L
149,2021,SE,FT,Cloud Data Engineer,160000,USD,160000,BR,100,US,S
150,2021,SE,FT,Director of Data Science,168000,USD,168000,JP,0,JP,S
151,2021,MI,FT,Data Scientist,150000,USD,150000,US,100,US,M
152,2021,MI,FT,Data Scientist,95000,CAD,75774,CA,100,CA,L
153,2021,EN,FT,Data Scientist,13400,USD,13400,UA,100,UA,L
154,2021,SE,FT,Data Science Manager,144000,USD,144000,US,100,US,L
155,2021,SE,FT,Data Science Engineer,159500,CAD,127221,CA,50,CA,L
156,2021,MI,FT,Data Scientist,160000,SGD,119059,SG,100,IL,M
157,2021,MI,FT,Applied Machine Learning Scientist,423000,USD,423000,US,50,US,L
158,2021,SE,FT,Data Analytics Manager,120000,USD,120000,US,100,US,M
159,2021,EN,FT,Machine Learning Engineer,125000,USD,125000,US,100,US,S
160,2021,EX,FT,Head of Data,230000,USD,230000,RU,50,RU,L
161,2021,EX,FT,Head of Data Science,85000,USD,85000,RU,0,RU,M
162,2021,MI,FT,Data Engineer,24000,EUR,28369,MT,50,MT,L
163,2021,EN,FT,Data Science Consultant,54000,EUR,63831,DE,50,DE,L
164,2021,EX,FT,Director of Data Science,110000,EUR,130026,DE,50,DE,M
165,2021,SE,FT,Data Specialist,165000,USD,165000,US,100,US,L
166,2021,EN,FT,Data Engineer,80000,USD,80000,US,100,US,L
167,2021,EX,FT,Director of Data Science,250000,USD,250000,US,0,US,L
168,2021,EN,FT,BI Data Analyst,55000,USD,55000,US,50,US,S
169,2021,MI,FT,Data Architect,150000,USD,150000,US,100,US,L
170,2021,MI,FT,Data Architect,170000,USD,170000,US,100,US,L
171,2021,MI,FT,Data Engineer,60000,GBP,82528,GB,100,GB,L
172,2021,EN,FT,Data Analyst,60000,USD,60000,US,100,US,S
173,2021,SE,FT,Principal Data Scientist,235000,USD,235000,US,100,US,L
174,2021,SE,FT,Research Scientist,51400,EUR,60757,PT,50,PT,L
175,2021,SE,FT,Data Engineering Manager,174000,USD,174000,US,100,US,L
176,2021,MI,FT,Data Scientist,58000,MXN,2859,MX,0,MX,S
177,2021,MI,FT,Data Scientist,30400000,CLP,40038,CL,100,CL,L
178,2021,EN,FT,Machine Learning Engineer,81000,USD,81000,US,50,US,S
179,2021,MI,FT,Data Scientist,420000,INR,5679,IN,100,US,S
180,2021,MI,FT,Big Data Engineer,1672000,INR,22611,IN,0,IN,L
181,2021,MI,FT,Data Scientist,76760,EUR,90734,DE,50,DE,L
182,2021,MI,FT,Data Engineer,22000,EUR,26005,RO,0,US,L
183,2021,SE,FT,Finance Data Analyst,45000,GBP,61896,GB,50,GB,L
184,2021,MI,FL,Machine Learning Scientist,12000,USD,12000,PK,50,PK,M
185,2021,MI,FT,Data Engineer,4000,USD,4000,IR,100,IR,M
186,2021,SE,FT,Data Analytics Engineer,50000,USD,50000,VN,100,GB,M
187,2021,EX,FT,Data Science Consultant,59000,EUR,69741,FR,100,ES,S
188,2021,SE,FT,Data Engineer,65000,EUR,76833,RO,50,GB,S
189,2021,MI,FT,Machine Learning Engineer,74000,USD,74000,JP,50,JP,S
190,2021,SE,FT,Data Science Manager,152000,USD,152000,US,100,FR,L
191,2021,EN,FT,Machine Learning Engineer,21844,USD,21844,CO,50,CO,M
192,2021,MI,FT,Big Data Engineer,18000,USD,18000,MD,0,MD,S
193,2021,SE,FT,Data Science Manager,174000,USD,174000,US,100,US,L
194,2021,SE,FT,Research Scientist,120500,CAD,96113,CA,50,CA,L
195,2021,MI,FT,Data Scientist,147000,USD,147000,US,50,US,L
196,2021,EN,FT,BI Data Analyst,9272,USD,9272,KE,100,KE,S
197,2021,SE,FT,Machine Learning Engineer,1799997,INR,24342,IN,100,IN,L
198,2021,SE,FT,Data Science Manager,4000000,INR,54094,IN,50,US,L
199,2021,EN,FT,Data Science Consultant,90000,USD,90000,US,100,US,S
200,2021,MI,FT,Data Scientist,52000,EUR,61467,DE,50,AT,M
201,2021,SE,FT,Machine Learning Infrastructure Engineer,195000,USD,195000,US,100,US,M
202,2021,MI,FT,Data Scientist,32000,EUR,37825,ES,100,ES,L
203,2021,SE,FT,Research Scientist,50000,USD,50000,FR,100,US,S
204,2021,MI,FT,Data Scientist,160000,USD,160000,US,100,US,L
205,2021,MI,FT,Data Scientist,69600,BRL,12901,BR,0,BR,S
206,2021,SE,FT,Machine Learning Engineer,200000,USD,200000,US,100,US,L
207,2021,SE,FT,Data Engineer,165000,USD,165000,US,0,US,M
208,2021,MI,FL,Data Engineer,20000,USD,20000,IT,0,US,L
209,2021,SE,FT,Data Analytics Manager,120000,USD,120000,US,0,US,L
210,2021,MI,FT,Machine Learning Engineer,21000,EUR,24823,SI,50,SI,L
211,2021,MI,FT,Research Scientist,48000,EUR,56738,FR,50,FR,S
212,2021,MI,FT,Data Engineer,48000,GBP,66022,HK,50,GB,S
213,2021,EN,FT,Big Data Engineer,435000,INR,5882,IN,0,CH,L
214,2021,EN,FT,Machine Learning Engineer,21000,EUR,24823,DE,50,DE,M
215,2021,SE,FT,Principal Data Engineer,185000,USD,185000,US,100,US,L
216,2021,EN,PT,Computer Vision Engineer,180000,DKK,28609,DK,50,DK,S
217,2021,MI,FT,Data Scientist,76760,EUR,90734,DE,50,DE,L
218,2021,MI,FT,Machine Learning Engineer,75000,EUR,88654,BE,100,BE,M
219,2021,SE,FT,Data Analytics Manager,140000,USD,140000,US,100,US,L
220,2021,MI,FT,Machine Learning Engineer,180000,PLN,46597,PL,100,PL,L
221,2021,MI,FT,Data Scientist,85000,GBP,116914,GB,50,GB,L
222,2021,MI,FT,Data Scientist,2500000,INR,33808,IN,0,IN,M
223,2021,MI,FT,Data Scientist,40900,GBP,56256,GB,50,GB,L
224,2021,SE,FT,Machine Learning Scientist,225000,USD,225000,US,100,CA,L
225,2021,EX,CT,Principal Data Scientist,416000,USD,416000,US,100,US,S
226,2021,SE,FT,Data Scientist,110000,CAD,87738,CA,100,CA,S
227,2021,MI,FT,Data Scientist,75000,EUR,88654,DE,50,DE,L
228,2021,SE,FT,Data Scientist,135000,USD,135000,US,0,US,L
229,2021,SE,FT,Data Analyst,90000,CAD,71786,CA,100,CA,M
230,2021,EN,FT,Big Data Engineer,1200000,INR,16228,IN,100,IN,L
231,2021,SE,FT,ML Engineer,256000,USD,256000,US,100,US,S
232,2021,SE,FT,Director of Data Engineering,200000,USD,200000,US,100,US,L
233,2021,SE,FT,Data Analyst,200000,USD,200000,US,100,US,L
234,2021,MI,FT,Data Architect,180000,USD,180000,US,100,US,L
235,2021,MI,FT,Head of Data Science,110000,USD,110000,US,0,US,S
236,2021,MI,FT,Research Scientist,80000,CAD,63810,CA,100,CA,M
237,2021,MI,FT,Data Scientist,39600,EUR,46809,ES,100,ES,M
238,2021,EN,FT,Data Scientist,4000,USD,4000,VN,0,VN,M
239,2021,EN,FT,Data Engineer,1600000,INR,21637,IN,50,IN,M
240,2021,SE,FT,Data Scientist,130000,CAD,103691,CA,100,CA,L
241,2021,MI,FT,Data Analyst,80000,USD,80000,US,100,US,L
242,2021,MI,FT,Data Engineer,110000,USD,110000,US,100,US,L
243,2021,SE,FT,Data Scientist,165000,USD,165000,US,100,US,L
244,2021,EN,FT,AI Scientist,1335000,INR,18053,IN,100,AS,S
245,2021,MI,FT,Data Engineer,52500,GBP,72212,GB,50,GB,L
246,2021,EN,FT,Data Scientist,31000,EUR,36643,FR,50,FR,L
247,2021,MI,FT,Data Engineer,108000,TRY,12103,TR,0,TR,M
248,2021,SE,FT,Data Engineer,70000,GBP,96282,GB,50,GB,L
249,2021,SE,FT,Principal Data Analyst,170000,USD,170000,US,100,US,M
250,2021,MI,FT,Data Scientist,115000,USD,115000,US,50,US,L
251,2021,EN,FT,Data Scientist,90000,USD,90000,US,100,US,S
252,2021,EX,FT,Principal Data Engineer,600000,USD,600000,US,100,US,L
253,2021,EN,FT,Data Scientist,2100000,INR,28399,IN,100,IN,M
254,2021,MI,FT,Data Analyst,93000,USD,93000,US,100,US,L
255,2021,SE,FT,Big Data Architect,125000,CAD,99703,CA,50,CA,M
256,2021,MI,FT,Data Engineer,200000,USD,200000,US,100,US,L
257,2021,SE,FT,Principal Data Scientist,147000,EUR,173762,DE,100,DE,M
258,2021,SE,FT,Machine Learning Engineer,185000,USD,185000,US,50,US,L
259,2021,EX,FT,Director of Data Science,120000,EUR,141846,DE,0,DE,L
260,2021,MI,FT,Data Scientist,130000,USD,130000,US,50,US,L
261,2021,SE,FT,Data Analyst,54000,EUR,63831,DE,50,DE,L
262,2021,MI,FT,Data Scientist,1250000,INR,16904,IN,100,IN,S
263,2021,SE,FT,Machine Learning Engineer,4900000,INR,66265,IN,0,IN,L
264,2021,MI,FT,Data Scientist,21600,EUR,25532,RS,100,DE,S
265,2021,SE,FT,Lead Data Engineer,160000,USD,160000,PR,50,US,S
266,2021,MI,FT,Data Engineer,93150,USD,93150,US,0,US,M
267,2021,MI,FT,Data Engineer,111775,USD,111775,US,0,US,M
268,2021,MI,FT,Data Engineer,250000,TRY,28016,TR,100,TR,M
269,2021,EN,FT,Data Engineer,55000,EUR,65013,DE,50,DE,M
270,2021,EN,FT,Data Engineer,72500,USD,72500,US,100,US,L
271,2021,SE,FT,Computer Vision Engineer,102000,BRL,18907,BR,0,BR,M
272,2021,EN,FT,Data Science Consultant,65000,EUR,76833,DE,0,DE,L
273,2021,EN,FT,Machine Learning Engineer,85000,USD,85000,NL,100,DE,S
274,2021,SE,FT,Data Scientist,65720,EUR,77684,FR,50,FR,M
275,2021,EN,FT,Data Scientist,100000,USD,100000,US,100,US,M
276,2021,EN,FT,Data Scientist,58000,USD,58000,US,50,US,L
277,2021,SE,FT,AI Scientist,55000,USD,55000,ES,100,ES,L
278,2021,SE,FT,Data Scientist,180000,TRY,20171,TR,50,TR,L
279,2021,EN,FT,Business Data Analyst,50000,EUR,59102,LU,100,LU,L
280,2021,MI,FT,Data Engineer,112000,USD,112000,US,100,US,L
281,2021,EN,FT,Research Scientist,100000,USD,100000,JE,0,CN,L
282,2021,MI,PT,Data Engineer,59000,EUR,69741,NL,100,NL,L
283,2021,SE,CT,Staff Data Scientist,105000,USD,105000,US,100,US,M
284,2021,MI,FT,Research Scientist,69999,USD,69999,CZ,50,CZ,L
285,2021,SE,FT,Data Science Manager,7000000,INR,94665,IN,50,IN,L
286,2021,SE,FT,Head of Data,87000,EUR,102839,SI,100,SI,L
287,2021,MI,FT,Data Scientist,109000,USD,109000,US,50,US,L
288,2021,MI,FT,Machine Learning Engineer,43200,EUR,51064,IT,50,IT,L
289,2022,SE,FT,Data Engineer,135000,USD,135000,US,100,US,M
290,2022,SE,FT,Data Analyst,155000,USD,155000,US,100,US,M
291,2022,SE,FT,Data Analyst,120600,USD,120600,US,100,US,M
292,2022,MI,FT,Data Scientist,130000,USD,130000,US,0,US,M
293,2022,MI,FT,Data Scientist,90000,USD,90000,US,0,US,M
294,2022,MI,FT,Data Engineer,170000,USD,170000,US,100,US,M
295,2022,MI,FT,Data Engineer,150000,USD,150000,US,100,US,M
296,2022,SE,FT,Data Analyst,102100,USD,102100,US,100,US,M
297,2022,SE,FT,Data Analyst,84900,USD,84900,US,100,US,M
298,2022,SE,FT,Data Scientist,136620,USD,136620,US,100,US,M
299,2022,SE,FT,Data Scientist,99360,USD,99360,US,100,US,M
300,2022,SE,FT,Data Scientist,90000,GBP,117789,GB,0,GB,M
301,2022,SE,FT,Data Scientist,80000,GBP,104702,GB,0,GB,M
302,2022,SE,FT,Data Scientist,146000,USD,146000,US,100,US,M
303,2022,SE,FT,Data Scientist,123000,USD,123000,US,100,US,M
304,2022,EN,FT,Data Engineer,40000,GBP,52351,GB,100,GB,M
305,2022,SE,FT,Data Analyst,99000,USD,99000,US,0,US,M
306,2022,SE,FT,Data Analyst,116000,USD,116000,US,0,US,M
307,2022,MI,FT,Data Analyst,106260,USD,106260,US,0,US,M
308,2022,MI,FT,Data Analyst,126500,USD,126500,US,0,US,M
309,2022,EX,FT,Data Engineer,242000,USD,242000,US,100,US,M
310,2022,EX,FT,Data Engineer,200000,USD,200000,US,100,US,M
311,2022,MI,FT,Data Scientist,50000,GBP,65438,GB,0,GB,M
312,2022,MI,FT,Data Scientist,30000,GBP,39263,GB,0,GB,M
313,2022,MI,FT,Data Engineer,60000,GBP,78526,GB,0,GB,M
314,2022,MI,FT,Data Engineer,40000,GBP,52351,GB,0,GB,M
315,2022,SE,FT,Data Scientist,165220,USD,165220,US,100,US,M
316,2022,EN,FT,Data Engineer,35000,GBP,45807,GB,100,GB,M
317,2022,SE,FT,Data Scientist,120160,USD,120160,US,100,US,M
318,2022,SE,FT,Data Analyst,90320,USD,90320,US,100,US,M
319,2022,SE,FT,Data Engineer,181940,USD,181940,US,0,US,M
320,2022,SE,FT,Data Engineer,132320,USD,132320,US,0,US,M
321,2022,SE,FT,Data Engineer,220110,USD,220110,US,0,US,M
322,2022,SE,FT,Data Engineer,160080,USD,160080,US,0,US,M
323,2022,SE,FT,Data Scientist,180000,USD,180000,US,0,US,L
324,2022,SE,FT,Data Scientist,120000,USD,120000,US,0,US,L
325,2022,SE,FT,Data Analyst,124190,USD,124190,US,100,US,M
326,2022,EX,FT,Data Analyst,130000,USD,130000,US,100,US,M
327,2022,EX,FT,Data Analyst,110000,USD,110000,US,100,US,M
328,2022,SE,FT,Data Analyst,170000,USD,170000,US,100,US,M
329,2022,MI,FT,Data Analyst,115500,USD,115500,US,100,US,M
330,2022,SE,FT,Data Analyst,112900,USD,112900,US,100,US,M
331,2022,SE,FT,Data Analyst,90320,USD,90320,US,100,US,M
332,2022,SE,FT,Data Analyst,112900,USD,112900,US,100,US,M
333,2022,SE,FT,Data Analyst,90320,USD,90320,US,100,US,M
334,2022,SE,FT,Data Engineer,165400,USD,165400,US,100,US,M
335,2022,SE,FT,Data Engineer,132320,USD,132320,US,100,US,M
336,2022,MI,FT,Data Analyst,167000,USD,167000,US,100,US,M
337,2022,SE,FT,Data Engineer,243900,USD,243900,US,100,US,M
338,2022,SE,FT,Data Analyst,136600,USD,136600,US,100,US,M
339,2022,SE,FT,Data Analyst,109280,USD,109280,US,100,US,M
340,2022,SE,FT,Data Engineer,128875,USD,128875,US,100,US,M
341,2022,SE,FT,Data Engineer,93700,USD,93700,US,100,US,M
342,2022,EX,FT,Head of Data Science,224000,USD,224000,US,100,US,M
343,2022,EX,FT,Head of Data Science,167875,USD,167875,US,100,US,M
344,2022,EX,FT,Analytics Engineer,175000,USD,175000,US,100,US,M
345,2022,SE,FT,Data Engineer,156600,USD,156600,US,100,US,M
346,2022,SE,FT,Data Engineer,108800,USD,108800,US,0,US,M
347,2022,SE,FT,Data Scientist,95550,USD,95550,US,0,US,M
348,2022,SE,FT,Data Engineer,113000,USD,113000,US,0,US,L
349,2022,SE,FT,Data Analyst,135000,USD,135000,US,100,US,M
350,2022,SE,FT,Data Science Manager,161342,USD,161342,US,100,US,M
351,2022,SE,FT,Data Science Manager,137141,USD,137141,US,100,US,M
352,2022,SE,FT,Data Scientist,167000,USD,167000,US,100,US,M
353,2022,SE,FT,Data Scientist,123000,USD,123000,US,100,US,M
354,2022,SE,FT,Data Engineer,60000,GBP,78526,GB,0,GB,M
355,2022,SE,FT,Data Engineer,50000,GBP,65438,GB,0,GB,M
356,2022,SE,FT,Data Scientist,150000,USD,150000,US,0,US,M
357,2022,SE,FT,Data Scientist,211500,USD,211500,US,100,US,M
358,2022,SE,FT,Data Architect,192400,USD,192400,CA,100,CA,M
359,2022,SE,FT,Data Architect,90700,USD,90700,CA,100,CA,M
360,2022,SE,FT,Data Analyst,130000,USD,130000,CA,100,CA,M
361,2022,SE,FT,Data Analyst,61300,USD,61300,CA,100,CA,M
362,2022,SE,FT,Data Analyst,130000,USD,130000,CA,100,CA,M
363,2022,SE,FT,Data Analyst,61300,USD,61300,CA,100,CA,M
364,2022,SE,FT,Data Engineer,160000,USD,160000,US,0,US,L
365,2022,SE,FT,Data Scientist,138600,USD,138600,US,100,US,M
366,2022,SE,FT,Data Engineer,136000,USD,136000,US,0,US,M
367,2022,MI,FT,Data Analyst,58000,USD,58000,US,0,US,S
368,2022,EX,FT,Analytics Engineer,135000,USD,135000,US,100,US,M
369,2022,SE,FT,Data Scientist,170000,USD,170000,US,100,US,M
370,2022,SE,FT,Data Scientist,123000,USD,123000,US,100,US,M
371,2022,SE,FT,Machine Learning Engineer,189650,USD,189650,US,0,US,M
372,2022,SE,FT,Machine Learning Engineer,164996,USD,164996,US,0,US,M
373,2022,MI,FT,ETL Developer,50000,EUR,54957,GR,0,GR,M
374,2022,MI,FT,ETL Developer,50000,EUR,54957,GR,0,GR,M
375,2022,EX,FT,Lead Data Engineer,150000,CAD,118187,CA,100,CA,S
376,2022,SE,FT,Data Analyst,132000,USD,132000,US,0,US,M
377,2022,SE,FT,Data Engineer,165400,USD,165400,US,100,US,M
378,2022,SE,FT,Data Architect,208775,USD,208775,US,100,US,M
379,2022,SE,FT,Data Architect,147800,USD,147800,US,100,US,M
380,2022,SE,FT,Data Engineer,136994,USD,136994,US,100,US,M
381,2022,SE,FT,Data Engineer,101570,USD,101570,US,100,US,M
382,2022,SE,FT,Data Analyst,128875,USD,128875,US,100,US,M
383,2022,SE,FT,Data Analyst,93700,USD,93700,US,100,US,M
384,2022,EX,FT,Head of Machine Learning,6000000,INR,79039,IN,50,IN,L
385,2022,SE,FT,Data Engineer,132320,USD,132320,US,100,US,M
386,2022,EN,FT,Machine Learning Engineer,28500,GBP,37300,GB,100,GB,L
387,2022,SE,FT,Data Analyst,164000,USD,164000,US,0,US,M
388,2022,SE,FT,Data Engineer,155000,USD,155000,US,100,US,M
389,2022,MI,FT,Machine Learning Engineer,95000,GBP,124333,GB,0,GB,M
390,2022,MI,FT,Machine Learning Engineer,75000,GBP,98158,GB,0,GB,M
391,2022,MI,FT,AI Scientist,120000,USD,120000,US,0,US,M
392,2022,SE,FT,Data Analyst,112900,USD,112900,US,100,US,M
393,2022,SE,FT,Data Analyst,90320,USD,90320,US,100,US,M
394,2022,SE,FT,Data Analytics Manager,145000,USD,145000,US,100,US,M
395,2022,SE,FT,Data Analytics Manager,105400,USD,105400,US,100,US,M
396,2022,MI,FT,Machine Learning Engineer,80000,EUR,87932,FR,100,DE,M
397,2022,MI,FT,Data Engineer,90000,GBP,117789,GB,0,GB,M
398,2022,SE,FT,Data Scientist,215300,USD,215300,US,100,US,L
399,2022,SE,FT,Data Scientist,158200,USD,158200,US,100,US,L
400,2022,SE,FT,Data Engineer,209100,USD,209100,US,100,US,L
401,2022,SE,FT,Data Engineer,154600,USD,154600,US,100,US,L
402,2022,SE,FT,Data Analyst,115934,USD,115934,US,0,US,M
403,2022,SE,FT,Data Analyst,81666,USD,81666,US,0,US,M
404,2022,SE,FT,Data Engineer,175000,USD,175000,US,100,US,M
405,2022,MI,FT,Data Engineer,75000,GBP,98158,GB,0,GB,M
406,2022,MI,FT,Data Analyst,58000,USD,58000,US,0,US,S
407,2022,SE,FT,Data Engineer,183600,USD,183600,US,100,US,L
408,2022,MI,FT,Data Analyst,40000,GBP,52351,GB,100,GB,M
409,2022,SE,FT,Data Scientist,180000,USD,180000,US,100,US,M
410,2022,MI,FT,Data Scientist,55000,GBP,71982,GB,0,GB,M
411,2022,MI,FT,Data Scientist,35000,GBP,45807,GB,0,GB,M
412,2022,MI,FT,Data Engineer,60000,EUR,65949,GR,100,GR,M
413,2022,MI,FT,Data Engineer,45000,EUR,49461,GR,100,GR,M
414,2022,MI,FT,Data Engineer,60000,GBP,78526,GB,100,GB,M
415,2022,MI,FT,Data Engineer,45000,GBP,58894,GB,100,GB,M
416,2022,SE,FT,Data Scientist,260000,USD,260000,US,100,US,M
417,2022,SE,FT,Data Science Engineer,60000,USD,60000,AR,100,MX,L
418,2022,MI,FT,Data Engineer,63900,USD,63900,US,0,US,M
419,2022,MI,FT,Machine Learning Scientist,160000,USD,160000,US,100,US,L
420,2022,MI,FT,Machine Learning Scientist,112300,USD,112300,US,100,US,L
421,2022,MI,FT,Data Science Manager,241000,USD,241000,US,100,US,M
422,2022,MI,FT,Data Science Manager,159000,USD,159000,US,100,US,M
423,2022,SE,FT,Data Scientist,180000,USD,180000,US,0,US,M
424,2022,SE,FT,Data Scientist,80000,USD,80000,US,0,US,M
425,2022,MI,FT,Data Engineer,82900,USD,82900,US,0,US,M
426,2022,SE,FT,Data Engineer,100800,USD,100800,US,100,US,L
427,2022,MI,FT,Data Engineer,45000,EUR,49461,ES,100,ES,M
428,2022,SE,FT,Data Scientist,140400,USD,140400,US,0,US,L
429,2022,MI,FT,Data Analyst,30000,GBP,39263,GB,100,GB,M
430,2022,MI,FT,Data Analyst,40000,EUR,43966,ES,100,ES,M
431,2022,MI,FT,Data Analyst,30000,EUR,32974,ES,100,ES,M
432,2022,MI,FT,Data Engineer,80000,EUR,87932,ES,100,ES,M
433,2022,MI,FT,Data Engineer,70000,EUR,76940,ES,100,ES,M
434,2022,MI,FT,Data Engineer,80000,GBP,104702,GB,100,GB,M
435,2022,MI,FT,Data Engineer,70000,GBP,91614,GB,100,GB,M
436,2022,MI,FT,Data Engineer,60000,EUR,65949,ES,100,ES,M
437,2022,MI,FT,Data Engineer,80000,EUR,87932,GR,100,GR,M
438,2022,SE,FT,Machine Learning Engineer,189650,USD,189650,US,0,US,M
439,2022,SE,FT,Machine Learning Engineer,164996,USD,164996,US,0,US,M
440,2022,MI,FT,Data Analyst,40000,EUR,43966,GR,100,GR,M
441,2022,MI,FT,Data Analyst,30000,EUR,32974,GR,100,GR,M
442,2022,MI,FT,Data Engineer,75000,GBP,98158,GB,100,GB,M
443,2022,MI,FT,Data Engineer,60000,GBP,78526,GB,100,GB,M
444,2022,SE,FT,Data Scientist,215300,USD,215300,US,0,US,L
445,2022,MI,FT,Data Engineer,70000,EUR,76940,GR,100,GR,M
446,2022,SE,FT,Data Engineer,209100,USD,209100,US,100,US,L
447,2022,SE,FT,Data Engineer,154600,USD,154600,US,100,US,L
448,2022,SE,FT,Data Engineer,180000,USD,180000,US,100,US,M
449,2022,EN,FT,ML Engineer,20000,EUR,21983,PT,100,PT,L
450,2022,SE,FT,Data Engineer,80000,USD,80000,US,100,US,M
451,2022,MI,FT,Machine Learning Developer,100000,CAD,78791,CA,100,CA,M
452,2022,EX,FT,Director of Data Science,250000,CAD,196979,CA,50,CA,L
453,2022,MI,FT,Machine Learning Engineer,120000,USD,120000,US,100,US,S
454,2022,EN,FT,Computer Vision Engineer,125000,USD,125000,US,0,US,M
455,2022,MI,FT,NLP Engineer,240000,CNY,37236,US,50,US,L
456,2022,SE,FT,Data Engineer,105000,USD,105000,US,100,US,M
457,2022,SE,FT,Lead Machine Learning Engineer,80000,EUR,87932,DE,0,DE,M
458,2022,MI,FT,Business Data Analyst,1400000,INR,18442,IN,100,IN,M
459,2022,MI,FT,Data Scientist,2400000,INR,31615,IN,100,IN,L
460,2022,MI,FT,Machine Learning Infrastructure Engineer,53000,EUR,58255,PT,50,PT,L
461,2022,EN,FT,Financial Data Analyst,100000,USD,100000,US,50,US,L
462,2022,MI,PT,Data Engineer,50000,EUR,54957,DE,50,DE,L
463,2022,EN,FT,Data Scientist,1400000,INR,18442,IN,100,IN,M
464,2022,SE,FT,Principal Data Scientist,148000,EUR,162674,DE,100,DE,M
465,2022,EN,FT,Data Engineer,120000,USD,120000,US,100,US,M
466,2022,SE,FT,Research Scientist,144000,USD,144000,US,50,US,L
467,2022,SE,FT,Data Scientist,104890,USD,104890,US,100,US,M
468,2022,SE,FT,Data Engineer,100000,USD,100000,US,100,US,M
469,2022,SE,FT,Data Scientist,140000,USD,140000,US,100,US,M
470,2022,MI,FT,Data Analyst,135000,USD,135000,US,100,US,M
471,2022,MI,FT,Data Analyst,50000,USD,50000,US,100,US,M
472,2022,SE,FT,Data Scientist,220000,USD,220000,US,100,US,M
473,2022,SE,FT,Data Scientist,140000,USD,140000,US,100,US,M
474,2022,MI,FT,Data Scientist,140000,GBP,183228,GB,0,GB,M
475,2022,MI,FT,Data Scientist,70000,GBP,91614,GB,0,GB,M
476,2022,SE,FT,Data Scientist,185100,USD,185100,US,100,US,M
477,2022,SE,FT,Machine Learning Engineer,220000,USD,220000,US,100,US,M
478,2022,MI,FT,Data Scientist,200000,USD,200000,US,100,US,M
479,2022,MI,FT,Data Scientist,120000,USD,120000,US,100,US,M
480,2022,SE,FT,Machine Learning Engineer,120000,USD,120000,AE,100,AE,S
481,2022,SE,FT,Machine Learning Engineer,65000,USD,65000,AE,100,AE,S
482,2022,EX,FT,Data Engineer,324000,USD,324000,US,100,US,M
483,2022,EX,FT,Data Engineer,216000,USD,216000,US,100,US,M
484,2022,SE,FT,Data Engineer,210000,USD,210000,US,100,US,M
485,2022,SE,FT,Machine Learning Engineer,120000,USD,120000,US,100,US,M
486,2022,SE,FT,Data Scientist,230000,USD,230000,US,100,US,M
487,2022,EN,PT,Data Scientist,100000,USD,100000,DZ,50,DZ,M
488,2022,MI,FL,Data Scientist,100000,USD,100000,CA,100,US,M
489,2022,EN,CT,Applied Machine Learning Scientist,29000,EUR,31875,TN,100,CZ,M
490,2022,SE,FT,Head of Data,200000,USD,200000,MY,100,US,M
491,2022,MI,FT,Principal Data Analyst,75000,USD,75000,CA,100,CA,S
492,2022,MI,FT,Data Scientist,150000,PLN,35590,PL,100,PL,L
493,2022,SE,FT,Machine Learning Developer,100000,CAD,78791,CA,100,CA,M
494,2022,SE,FT,Data Scientist,100000,USD,100000,BR,100,US,M
495,2022,MI,FT,Machine Learning Scientist,153000,USD,153000,US,50,US,M
496,2022,EN,FT,Data Engineer,52800,EUR,58035,PK,100,DE,M
497,2022,SE,FT,Data Scientist,165000,USD,165000,US,100,US,M
498,2022,SE,FT,Research Scientist,85000,EUR,93427,FR,50,FR,L
499,2022,EN,FT,Data Scientist,66500,CAD,52396,CA,100,CA,L
500,2022,SE,FT,Machine Learning Engineer,57000,EUR,62651,NL,100,NL,L
501,2022,MI,FT,Head of Data,30000,EUR,32974,EE,100,EE,S
502,2022,EN,FT,Data Scientist,40000,USD,40000,JP,100,MY,L
503,2022,MI,FT,Machine Learning Engineer,121000,AUD,87425,AU,100,AU,L
504,2022,SE,FT,Data Engineer,115000,USD,115000,US,100,US,M
505,2022,EN,FT,Data Scientist,120000,AUD,86703,AU,50,AU,M
506,2022,MI,FT,Applied Machine Learning Scientist,75000,USD,75000,BO,100,US,L
507,2022,MI,FT,Research Scientist,59000,EUR,64849,AT,0,AT,L
508,2022,EN,FT,Research Scientist,120000,USD,120000,US,100,US,L
509,2022,MI,FT,Applied Data Scientist,157000,USD,157000,US,100,US,L
510,2022,EN,FT,Computer Vision Software Engineer,150000,USD,150000,AU,100,AU,S
511,2022,MI,FT,Business Data Analyst,90000,CAD,70912,CA,50,CA,L
512,2022,EN,FT,Data Engineer,65000,USD,65000,US,100,US,S
513,2022,SE,FT,Machine Learning Engineer,65000,EUR,71444,IE,100,IE,S
514,2022,EN,FT,Data Analytics Engineer,20000,USD,20000,PK,0,PK,M
515,2022,MI,FT,Data Scientist,48000,USD,48000,RU,100,US,S
516,2022,SE,FT,Data Science Manager,152500,USD,152500,US,100,US,M
517,2022,MI,FT,Data Engineer,62000,EUR,68147,FR,100,FR,M
518,2022,MI,FT,Data Scientist,115000,CHF,122346,CH,0,CH,L
519,2022,SE,FT,Applied Data Scientist,380000,USD,380000,US,100,US,L
520,2022,MI,FT,Data Scientist,88000,CAD,69336,CA,100,CA,M
521,2022,EN,FT,Computer Vision Engineer,10000,USD,10000,PT,100,LU,M
522,2022,MI,FT,Data Analyst,20000,USD,20000,GR,100,GR,S
523,2022,SE,FT,Data Analytics Lead,405000,USD,405000,US,100,US,L
524,2022,MI,FT,Data Scientist,135000,USD,135000,US,100,US,L
525,2022,SE,FT,Applied Data Scientist,177000,USD,177000,US,100,US,L
526,2022,MI,FT,Data Scientist,78000,USD,78000,US,100,US,M
527,2022,SE,FT,Data Analyst,135000,USD,135000,US,100,US,M
528,2022,SE,FT,Data Analyst,100000,USD,100000,US,100,US,M
529,2022,SE,FT,Data Analyst,90320,USD,90320,US,100,US,M
530,2022,MI,FT,Data Analyst,85000,USD,85000,CA,0,CA,M
531,2022,MI,FT,Data Analyst,75000,USD,75000,CA,0,CA,M
532,2022,SE,FT,Machine Learning Engineer,214000,USD,214000,US,100,US,M
533,2022,SE,FT,Machine Learning Engineer,192600,USD,192600,US,100,US,M
534,2022,SE,FT,Data Architect,266400,USD,266400,US,100,US,M
535,2022,SE,FT,Data Architect,213120,USD,213120,US,100,US,M
536,2022,SE,FT,Data Analyst,112900,USD,112900,US,100,US,M
537,2022,SE,FT,Data Engineer,155000,USD,155000,US,100,US,M
538,2022,MI,FT,Data Scientist,141300,USD,141300,US,0,US,M
539,2022,MI,FT,Data Scientist,102100,USD,102100,US,0,US,M
540,2022,SE,FT,Data Analyst,115934,USD,115934,US,100,US,M
541,2022,SE,FT,Data Analyst,81666,USD,81666,US,100,US,M
542,2022,MI,FT,Data Engineer,206699,USD,206699,US,0,US,M
543,2022,MI,FT,Data Engineer,99100,USD,99100,US,0,US,M
544,2022,SE,FT,Data Engineer,130000,USD,130000,US,100,US,M
545,2022,SE,FT,Data Engineer,115000,USD,115000,US,100,US,M
546,2022,SE,FT,Data Engineer,110500,USD,110500,US,100,US,M
547,2022,SE,FT,Data Engineer,130000,USD,130000,US,100,US,M
548,2022,SE,FT,Data Analyst,99050,USD,99050,US,100,US,M
549,2022,SE,FT,Data Engineer,160000,USD,160000,US,100,US,M
550,2022,SE,FT,Data Scientist,205300,USD,205300,US,0,US,L
551,2022,SE,FT,Data Scientist,140400,USD,140400,US,0,US,L
552,2022,SE,FT,Data Scientist,176000,USD,176000,US,100,US,M
553,2022,SE,FT,Data Scientist,144000,USD,144000,US,100,US,M
554,2022,SE,FT,Data Engineer,200100,USD,200100,US,100,US,M
555,2022,SE,FT,Data Engineer,160000,USD,160000,US,100,US,M
556,2022,SE,FT,Data Engineer,145000,USD,145000,US,100,US,M
557,2022,SE,FT,Data Engineer,70500,USD,70500,US,0,US,M
558,2022,SE,FT,Data Scientist,205300,USD,205300,US,0,US,M
559,2022,SE,FT,Data Scientist,140400,USD,140400,US,0,US,M
560,2022,SE,FT,Analytics Engineer,205300,USD,205300,US,0,US,M
561,2022,SE,FT,Analytics Engineer,184700,USD,184700,US,0,US,M
562,2022,SE,FT,Data Engineer,175100,USD,175100,US,100,US,M
563,2022,SE,FT,Data Engineer,140250,USD,140250,US,100,US,M
564,2022,SE,FT,Data Analyst,116150,USD,116150,US,100,US,M
565,2022,SE,FT,Data Engineer,54000,USD,54000,US,0,US,M
566,2022,SE,FT,Data Analyst,170000,USD,170000,US,100,US,M
567,2022,MI,FT,Data Analyst,50000,GBP,65438,GB,0,GB,M
568,2022,SE,FT,Data Analyst,80000,USD,80000,US,100,US,M
569,2022,SE,FT,Data Scientist,140000,USD,140000,US,100,US,M
570,2022,SE,FT,Data Scientist,210000,USD,210000,US,100,US,M
571,2022,SE,FT,Data Scientist,140000,USD,140000,US,100,US,M
572,2022,SE,FT,Data Analyst,100000,USD,100000,US,100,US,M
573,2022,SE,FT,Data Analyst,69000,USD,69000,US,100,US,M
574,2022,SE,FT,Data Scientist,210000,USD,210000,US,100,US,M
575,2022,SE,FT,Data Scientist,140000,USD,140000,US,100,US,M
576,2022,SE,FT,Data Scientist,210000,USD,210000,US,100,US,M
577,2022,SE,FT,Data Analyst,150075,USD,150075,US,100,US,M
578,2022,SE,FT,Data Engineer,100000,USD,100000,US,100,US,M
||||
579,2022,SE,FT,Data Engineer,25000,USD,25000,US,100,US,M
|
||||
580,2022,SE,FT,Data Analyst,126500,USD,126500,US,100,US,M
|
||||
581,2022,SE,FT,Data Analyst,106260,USD,106260,US,100,US,M
|
||||
582,2022,SE,FT,Data Engineer,220110,USD,220110,US,100,US,M
|
||||
583,2022,SE,FT,Data Engineer,160080,USD,160080,US,100,US,M
|
||||
584,2022,SE,FT,Data Analyst,105000,USD,105000,US,100,US,M
|
||||
585,2022,SE,FT,Data Analyst,110925,USD,110925,US,100,US,M
|
||||
586,2022,MI,FT,Data Analyst,35000,GBP,45807,GB,0,GB,M
|
||||
587,2022,SE,FT,Data Scientist,140000,USD,140000,US,100,US,M
|
||||
588,2022,SE,FT,Data Analyst,99000,USD,99000,US,0,US,M
|
||||
589,2022,SE,FT,Data Analyst,60000,USD,60000,US,100,US,M
|
||||
590,2022,SE,FT,Data Architect,192564,USD,192564,US,100,US,M
|
||||
591,2022,SE,FT,Data Architect,144854,USD,144854,US,100,US,M
|
||||
592,2022,SE,FT,Data Scientist,230000,USD,230000,US,100,US,M
|
||||
593,2022,SE,FT,Data Scientist,150000,USD,150000,US,100,US,M
|
||||
594,2022,SE,FT,Data Analytics Manager,150260,USD,150260,US,100,US,M
|
||||
595,2022,SE,FT,Data Analytics Manager,109280,USD,109280,US,100,US,M
|
||||
596,2022,SE,FT,Data Scientist,210000,USD,210000,US,100,US,M
|
||||
597,2022,SE,FT,Data Analyst,170000,USD,170000,US,100,US,M
|
||||
598,2022,MI,FT,Data Scientist,160000,USD,160000,US,100,US,M
|
||||
599,2022,MI,FT,Data Scientist,130000,USD,130000,US,100,US,M
|
||||
600,2022,EN,FT,Data Analyst,67000,USD,67000,CA,0,CA,M
|
||||
601,2022,EN,FT,Data Analyst,52000,USD,52000,CA,0,CA,M
|
||||
602,2022,SE,FT,Data Engineer,154000,USD,154000,US,100,US,M
|
||||
603,2022,SE,FT,Data Engineer,126000,USD,126000,US,100,US,M
|
||||
604,2022,SE,FT,Data Analyst,129000,USD,129000,US,0,US,M
|
||||
605,2022,SE,FT,Data Analyst,150000,USD,150000,US,100,US,M
|
||||
606,2022,MI,FT,AI Scientist,200000,USD,200000,IN,100,US,L
|
||||
|
60
degtyarev_mikhail_lab_6/main.py
Normal file
@@ -0,0 +1,60 @@
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import matplotlib.pyplot as plt

# Load the data
file_path = 'ds_salaries.csv'
data = pd.read_csv(file_path)

# Data preprocessing
categorical_features = ['experience_level', 'employment_type', 'company_location', 'company_size']
numeric_features = ['work_year']

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ])

# Feature selection
features = ['work_year', 'experience_level', 'employment_type', 'company_location', 'company_size']
X = data[features]
y = data['salary_in_usd']

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Build and train the model with the preprocessing pipeline
alpha = 0.01
lasso_model = Pipeline([
    ('preprocessor', preprocessor),
    ('lasso', Lasso(alpha=alpha))
])

lasso_model.fit(X_train, y_train)

# Get predictions
y_pred = lasso_model.predict(X_test)

# Evaluate the model
accuracy = lasso_model.score(X_test, y_test)
mse = mean_squared_error(y_test, y_pred)

print(f"R^2 Score: {accuracy:.2f}")
print(f"Mean Squared Error: {mse:.2f}")

# Print predicted vs. actual values
predictions_df = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
print(predictions_df)

# Visualize the model weights (coefficients).
# Note: OneHotEncoder.get_feature_names() was removed in scikit-learn 1.2;
# get_feature_names_out() is the current equivalent.
coefficients = pd.Series(
    lasso_model.named_steps['lasso'].coef_,
    index=numeric_features + list(
        lasso_model.named_steps['preprocessor'].transformers_[1][1].get_feature_names_out(categorical_features)))
plt.figure(figsize=(10, 6))
coefficients.sort_values().plot(kind='barh')
plt.title('Lasso Regression Coefficients')
plt.show()
55
degtyarev_mikhail_lab_7/Readme.md
Normal file
@@ -0,0 +1,55 @@
# Lab 7
## Variant 9

## Task
Pick a literary text (even variants use a Russian-language text, odd variants an English-language one) and train a recurrent neural network on it to solve a text-generation task. Tune the architecture and parameters to get as close as possible to a meaningful result. Then pair up (even with odd variant), exchange the trained networks, and check how your partner's architecture handles your text.

## Program Description
The program is an example of using a recurrent neural network (LSTM) to generate text based on a literary work.
### Libraries used
- `numpy`: a library for multidimensional arrays and mathematical functions.
- `keras`:
  - `Sequential`: a neural network model that is a linear stack of layers.
  - `Embedding`: a layer that maps integers (word indices) to dense vectors of fixed size.
  - `LSTM`: a long short-term memory recurrent layer.
  - `Dense`: a fully connected layer with softmax activation that produces a probability distribution over words.
  - `Tokenizer`, `pad_sequences`: tools for tokenizing and sequencing text.

### Program steps

1. **Loading the data:**
   - The text is loaded from the file `text.txt` (an English-language text) using standard Python facilities.

2. **Preparing the training data:**
   - The text is split into token sequences for training the recurrent neural network.
   - A `Tokenizer` builds the vocabulary and converts the text to a numeric representation.
   - The sequences are padded to the maximum length with `pad_sequences`.

3. **Building and compiling the model:**
   - A sequential model is built with an embedding layer, an LSTM recurrent layer, and a fully connected layer.
   - The model is compiled with categorical cross-entropy as the loss function and the Adam optimizer.

4. **Training the model:**
   - The model is trained on the prepared data for 100 epochs.

5. **Evaluating the model:**
   - The final loss on the training data is printed.

6. **Generating text:**
   - The seed text "Amidst the golden hues of autumn leaves" is created.
   - The model predicts the next word of the sequence, one word at a time (a temperature-sampling variant is sketched after this README).
   - The generated text is printed to the screen.

### Running the program
- Replace `'text.txt'` with the actual path or name of your English-language text file.
- Clone or download the code from `main.py`.
- Run the file in an environment that can execute Python: `python main.py`

### Results

The loss on the training data came out fairly small: 0.029374321853453274327

The generated English-language text:
In the quietude of the woods, mystical creatures stirred, their silhouettes dancing in the dappling sunlight. A mysterious energy enveloped the surroundings, as if the very essence of nature had come alive. The rustling leaves seemed to carry ancient tales, whispered secrets of times long past. Each step through the foliage unveiled a new chapter in the enchanted story of the woodland realm.
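As a hedged illustration (not part of the lab): `main.py` below always picks the single most likely next word, which tends to produce repetitive loops. A common alternative is temperature sampling. This sketch assumes the `model`, `tokenizer` and `max_sequence_length` defined in `main.py`; the function name is hypothetical.

```python
# Hedged sketch: temperature sampling for the word-level generator.
# Assumes model, tokenizer and max_sequence_length from main.py below.
import numpy as np
from keras.preprocessing.sequence import pad_sequences

def sample_next_word(model, tokenizer, seed_text, max_sequence_length, temperature=0.8):
    token_list = tokenizer.texts_to_sequences([seed_text])[0]
    token_list = pad_sequences([token_list], maxlen=max_sequence_length - 1, padding='pre')
    probs = model.predict(token_list, verbose=0)[0]
    # Rescale the distribution: temperature < 1 sharpens it, > 1 flattens it
    probs = np.log(probs + 1e-9) / temperature
    probs = np.exp(probs) / np.sum(np.exp(probs))
    index = np.random.choice(len(probs), p=probs)
    return tokenizer.index_word.get(index, '')
```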
60
degtyarev_mikhail_lab_7/main.py
Normal file
@@ -0,0 +1,60 @@
import numpy as np
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# Load the text
with open('text.txt', 'r', encoding='utf-8') as file:
    text = file.read()

tokenizer = Tokenizer()
tokenizer.fit_on_texts([text])
total_words = len(tokenizer.word_index) + 1

# Create the sequence of training data
input_sequences = []
for line in text.split('\n'):
    token_list = tokenizer.texts_to_sequences([line])[0]
    for i in range(1, len(token_list)):
        n_gram_sequence = token_list[:i+1]
        input_sequences.append(n_gram_sequence)

# Padding sequences
max_sequence_length = max([len(seq) for seq in input_sequences])
input_sequences = pad_sequences(input_sequences, maxlen=max_sequence_length, padding='pre')

# Create input and output data
X, y = input_sequences[:, :-1], input_sequences[:, -1]
y = np.eye(total_words)[y]

# Create the model
model = Sequential()
model.add(Embedding(total_words, 50, input_length=max_sequence_length-1))
model.add(LSTM(100))
model.add(Dense(total_words, activation='softmax'))

# Compile the model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
history = model.fit(X, y, epochs=100, verbose=2)

print(f"Final Loss on Training Data: {history.history['loss'][-1]}")

# Generate text
seed_text = "Amidst the golden hues of autumn leaves"
next_words = 100

for _ in range(next_words):
    token_list = tokenizer.texts_to_sequences([seed_text])[0]
    token_list = pad_sequences([token_list], maxlen=max_sequence_length-1, padding='pre')
    # predict_classes() was removed in TF 2.6; take the argmax of predict() instead
    predicted = np.argmax(model.predict(token_list, verbose=0), axis=-1)[0]
    output_word = ""
    for word, index in tokenizer.word_index.items():
        if index == predicted:
            output_word = word
            break
    seed_text += " " + output_word

print(seed_text)
1
degtyarev_mikhail_lab_7/text.txt
Normal file
@@ -0,0 +1 @@
Amidst the golden hues of autumn leaves, a gentle breeze whispered through the trees. The air was filled with the sweet fragrance of blooming flowers, and the sun cast a warm glow on the peaceful landscape. Birds chirped melodiously, creating a symphony of nature's harmonious melodies. As the day unfolded, the sky painted itself in vibrant shades of orange and pink, showcasing the breathtaking beauty of the changing seasons.
BIN
gordeeva_anna_lab_6/1aIk7s_b66s.jpg
Normal file
After Width: | Height: | Size: 28 KiB |
23
gordeeva_anna_lab_6/README.md
Normal file
@@ -0,0 +1,23 @@
## Data
I use the following data:
* Link to the painting image
* Painting size in cm
* Average review rating
* Number of orders
* Price
To make the analysis more specific, the following data were added manually:
* Genre (e.g., landscape, animals, portrait, etc.)
* Subgenre (e.g., cityscape, cats, dogs, etc.)

## Classification task and solution (neural network)
We need to suggest/predict a subgenre for the user based on the chosen genre and price category. There is no need to bin the target into groups, since the parameter is already categorical. To perform the classification, all categorical parameters are converted to numbers. The model's accuracy does not exceed 0.30, so the model can be considered unsuccessful. This is driven by the large number of classes in the data, which makes the task harder (a hedged baseline sketch follows this README). The prediction results are shown in figures 5 and 6.

## Result



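To judge whether 0.30 accuracy is meaningfully above chance, it helps to compare against a majority-class baseline. A minimal, self-contained sketch (synthetic data standing in for the lab's real dataset; the sizes and class count are assumptions):

```python
# Hedged sketch: with many classes and few informative features, even a
# majority-class baseline can be hard to beat. Synthetic stand-in data.
import numpy as np
from sklearn.dummy import DummyClassifier
from sklearn.model_selection import train_test_split

rng = np.random.default_rng(42)
X = rng.integers(0, 10, size=(500, 2))   # two integer-encoded categorical features
y = rng.integers(0, 20, size=500)        # 20 subgenre classes (assumed)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
baseline = DummyClassifier(strategy='most_frequent').fit(X_train, y_train)
print(f"Majority-class baseline accuracy: {baseline.score(X_test, y_test):.2f}")
```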
BIN
gordeeva_anna_lab_6/cAofDwrO6o4.jpg
Normal file
After Width: | Height: | Size: 26 KiB |
74
gordeeva_anna_lab_6/laba6.py
Normal file
@@ -0,0 +1,74 @@
import pandas as pd
import streamlit as st
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
import numpy as np

data = pd.read_csv('222.csv')

genre_mapping = {genre: code for code, genre in enumerate(data['Жанр'].unique())}
subgenre_mapping = {subgenre: code for code, subgenre in enumerate(data['Поджанр'].unique())}
price_mapping = {price: code for code, price in enumerate(data['Категория стоимости'].unique())}

# Convert categorical values to numeric codes
data['Жанр'] = data['Жанр'].map(genre_mapping)
data['Поджанр'] = data['Поджанр'].map(subgenre_mapping)
data['Категория стоимости'] = data['Категория стоимости'].map(price_mapping)

columns_to_check = ['Размер', 'Жанр', 'Поджанр', 'Категория стоимости']
data = data.dropna(subset=columns_to_check)

# Split the data into features (X) and the target variable (y)
X = data[['Жанр', 'Категория стоимости']]
y = data['Поджанр']

# Split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Normalize the data
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Initialize the MLPClassifier
clf = MLPClassifier(hidden_layer_sizes=(100, 50), max_iter=500, alpha=0.1, solver='adam', random_state=42)

# Train the model
clf.fit(X_train, y_train)

# Predict on the test set
predictions = clf.predict(X_test)

# Evaluate model accuracy
accuracy = accuracy_score(y_test, predictions)
st.write(f"Точность модели: {accuracy}")

on_pred = st.toggle('')
if on_pred:
    selected_genre = st.selectbox('Выберите жанр:', genre_mapping)
    selected_price = st.selectbox('Выберите категорию стоимости:', price_mapping)

    new_data = pd.DataFrame({'Жанр': [selected_genre], 'Категория стоимости': [selected_price]}, index=[0])

    new_data['Жанр'] = new_data['Жанр'].map(genre_mapping)
    new_data['Категория стоимости'] = new_data['Категория стоимости'].map(price_mapping)

    new_data_normalized = scaler.transform(new_data.values)
    new_predictions = clf.predict(new_data_normalized)

    # Build a reverse dictionary to map numeric subgenre codes back to text labels
    reverse_subgenre_mapping = {code: subgenre for subgenre, code in subgenre_mapping.items()}

    # Convert the numeric predictions back to subgenre text labels
    predicted_subgenres = [reverse_subgenre_mapping[code] for code in new_predictions]

    # Print the predicted subgenres for the new data
    st.write("Предсказанный поджанр:")
    for subgenre in predicted_subgenres:
        if isinstance(subgenre, float) and np.isnan(subgenre):
            st.write("Не удалось предсказать, мало данных по данному жанру")
        else:
            st.write(subgenre)
43
gordeeva_anna_lab_7/README.md
Normal file
@@ -0,0 +1,43 @@
## Task
Pick a literary text and train a recurrent neural network on it to solve a text-generation task.

## Dependencies
The following Python libraries are required:
* NumPy
* TensorFlow
* Streamlit

## Running
```bash
streamlit run laba7.py
```

## Code overview
1. Importing libraries:

The required libraries are imported: docx for reading text from Word files, streamlit for the web app, and numpy, tensorflow and keras for training the neural networks.

2. Extracting text from Word files:

The extract_text_from_docx function extracts the text from two Word files, one in Russian (textru) and one in English (texten), using the docx library.

3. Preparing the training data:

The text from the files is split into sequences for training the recurrent neural networks (LSTM). The text is cut into sequences of a fixed length (maxlen) that are used to train the models on the Russian and English texts.

4. Building and training the models:

Two separate model instances (model_russian and model_english) are created and trained on the Russian and English text data, respectively.

5. Generating text from the trained models:

The generate_text function generates text from the trained models. The generated text is displayed in the web app via streamlit.

## Result
Generated Russian text:

Ты к моему несчастью верь как в святыню верит монах как в чудо чудо верит дева как верят в вечернюю печальные странники в пути

Generated English text:

In the to my distress as the monk believes in a shrine as the maiden believes in a miracle as weary travelers believe in the evening star on their journey
99
gordeeva_anna_lab_7/laba7.py
Normal file
@@ -0,0 +1,99 @@
import docx
import streamlit as st
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Embedding

def extract_text_from_docx(file_path):
    doc = docx.Document(file_path)
    full_text = []

    for para in doc.paragraphs:
        full_text.append(para.text)

    return '\n'.join(full_text)

file_path1 = 'C:/Users/79084/Desktop/textru.doc'
file_path2 = 'C:/Users/79084/Desktop/texten.doc'

# Extract the text from the files
textru = extract_text_from_docx(file_path1)
texten = extract_text_from_docx(file_path2)

# Text preprocessing (fit_on_texts expects a list of texts)
tokenizer_russian = tf.keras.preprocessing.text.Tokenizer(char_level=True)
tokenizer_russian.fit_on_texts([textru])
tokenized_text_russian = tokenizer_russian.texts_to_sequences([textru])[0]

tokenizer_english = tf.keras.preprocessing.text.Tokenizer(char_level=True)
tokenizer_english.fit_on_texts([texten])
tokenized_text_english = tokenizer_english.texts_to_sequences([texten])[0]

# Build consecutive training sequences
maxlen = 40
step = 3
sentences_russian = []
next_chars_russian = []
sentences_english = []
next_chars_english = []

for i in range(0, len(tokenized_text_russian) - maxlen, step):
    sentences_russian.append(tokenized_text_russian[i: i + maxlen])
    next_chars_russian.append(tokenized_text_russian[i + maxlen])

for i in range(0, len(tokenized_text_english) - maxlen, step):
    sentences_english.append(tokenized_text_english[i: i + maxlen])
    next_chars_english.append(tokenized_text_english[i + maxlen])

# Convert the data to numpy arrays
x_russian = np.array(sentences_russian)
y_russian = np.array(next_chars_russian)
x_english = np.array(sentences_english)
y_english = np.array(next_chars_english)

# Build the model for the Russian text
model_russian = Sequential()
model_russian.add(Embedding(len(tokenizer_russian.word_index) + 1, 128))
model_russian.add(LSTM(128))
model_russian.add(Dense(len(tokenizer_russian.word_index) + 1, activation='softmax'))

model_russian.compile(loss='sparse_categorical_crossentropy', optimizer='adam')

# Train the model on the Russian text
model_russian.fit(x_russian, y_russian, batch_size=128, epochs=50)

# Build the model for the English text
model_english = Sequential()
model_english.add(Embedding(len(tokenizer_english.word_index) + 1, 128))
model_english.add(LSTM(128))
model_english.add(Dense(len(tokenizer_english.word_index) + 1, activation='softmax'))

model_english.compile(loss='sparse_categorical_crossentropy', optimizer='adam')

# Train the model on the English text
model_english.fit(x_english, y_english, batch_size=128, epochs=50)

# Generate text from a trained model
def generate_text(model, tokenizer, seed_text, maxlen, temperature=1.0, num_chars=400):
    generated_text = seed_text
    for _ in range(num_chars):
        encoded = tokenizer.texts_to_sequences([seed_text])[0]
        # Keep only the last maxlen tokens and add a batch dimension,
        # so the input shape matches what the model was trained on
        encoded = np.array([encoded[-maxlen:]])
        predicted_probs = model.predict(encoded, verbose=0)[0]
        # Use temperature for more varied predictions
        predicted_probs = np.log(predicted_probs) / temperature
        exp_preds = np.exp(predicted_probs)
        predicted_probs = exp_preds / np.sum(exp_preds)
        predicted = np.random.choice(len(predicted_probs), p=predicted_probs)
        next_char = tokenizer.index_word.get(predicted, '')
        generated_text += next_char
        seed_text += next_char
        seed_text = seed_text[1:]
    return generated_text

generated_russian_text = generate_text(model_russian, tokenizer_russian, 'Ты к моему', maxlen, temperature=0.5, num_chars=400)
st.write(generated_russian_text)

generated_english_text = generate_text(model_english, tokenizer_english, 'In the', maxlen, temperature=0.5, num_chars=400)
st.write(generated_english_text)
5
gordeeva_anna_lab_7/texten.txt
Normal file
@@ -0,0 +1,5 @@
Believe in me, to my distress,
As the monk believes in a shrine,
As the maiden believes in a miracle,
As weary travelers believe
In the evening star on their journey.
5
gordeeva_anna_lab_7/textru.txt
Normal file
@@ -0,0 +1,5 @@
Ты, к моему несчастью, верь,
Как в святыню, верит монах,
Как в чудо, верит дева,
Как верят в вечернюю звезду
Печальные странники в пути.
BIN
istyukov_timofey_lab1/1_linear_regression.png
Normal file
After Width: | Height: | Size: 62 KiB |
BIN
istyukov_timofey_lab1/2_perceptron.png
Normal file
After Width: | Height: | Size: 60 KiB |
BIN
istyukov_timofey_lab1/3_poly_ridge.png
Normal file
After Width: | Height: | Size: 65 KiB |
61
istyukov_timofey_lab1/README.md
Normal file
@@ -0,0 +1,61 @@
# Lab 1. Working with standard datasets and different models
## Variant 12
___

### Task:
Using the code from the section "Regularization and a feed-forward network", generate the specified type of data and compare 3 models on it (per the variant). Plot the graphs, report the models' quality, and explain the results.

### Data for the variant:
- make_classification (n_samples=500, n_features=2, n_redundant=0, n_informative=2, random_state=rs, n_clusters_per_class=1)

### Models for the variant:
- Linear regression
- Perceptron
- Ridge polynomial regression (degree 4, alpha = 1.0)

___

### Running
- Run the file lab1.py

### Technologies used
- Programming language **Python**
- IDE **PyCharm**
- Libraries:
  * numpy
  * sklearn
  * matplotlib

### Program description
The program generates a dataset with the make_classification() function using the parameters given by the variant. It then prints the quality of the variant's models to the console and plots a graph for each model.

Quality is estimated with the models' built-in **.score()** method. Note that this is not the same metric for all three models: for the perceptron it is mean classification accuracy, while for the regression models it is the R² coefficient (see the sketch after this README), which partly explains the low "accuracy" of the regressors below.

___
### Example run


```text
===> Линейная регрессия <===
Оценка точности:  0.4513003751817972
```
___


```text
===> Персептрон <===
Оценка точности:  0.7591836734693878
```
___


```text
===> Гребневая полиномиальная регрессия <===
Оценка точности:  0.5312017992195672
```

### Conclusion
According to the accuracy scores printed to the console, the **perceptron** model showed the best result.
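A hedged sketch of what `.score()` computes for the models above (not part of the lab; it assumes the fitted models and training data defined inside lab1.py below):

```python
# Hedged sketch: .score() dispatches to different metrics per estimator type.
from sklearn.metrics import accuracy_score, r2_score

# equivalent to perceptron_model.score(X_train, y_train) -- mean accuracy
acc = accuracy_score(y_train, perceptron_model.predict(X_train))

# equivalent to lin_reg_model.score(X_train, y_train) -- R^2 coefficient
r2 = r2_score(y_train, lin_reg_model.predict(X_train))
```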
101
istyukov_timofey_lab1/lab1.py
Normal file
@@ -0,0 +1,101 @@
# Variant 12
# Data: make_classification (n_samples=500, n_features=2, n_redundant=0,
#                            n_informative=2, random_state=rs, n_clusters_per_class=1)
# Models:
# -- Linear regression
# -- Perceptron
# -- Ridge polynomial regression (degree 4, alpha = 1.0)

import numpy as np
from sklearn.datasets import make_classification
from sklearn.linear_model import LinearRegression, Perceptron, Ridge
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures
from matplotlib import pyplot as plt
from matplotlib.colors import ListedColormap


cm_bright_1 = ListedColormap(['#7FFFD4', '#00FFFF'])
cm_bright_2 = ListedColormap(['#FF69B4', '#FF1493'])


def main():
    X, y = make_classification(
        n_samples=500,
        n_features=2,
        n_redundant=0,
        n_informative=2,
        random_state=0,
        n_clusters_per_class=1)
    rng = np.random.RandomState(2)
    X += 2 * rng.uniform(size=X.shape)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=10, random_state=40)

    # the models built on the generated data
    my_linear_regression(X_train, X_test, y_train, y_test)
    my_perceptron(X_train, X_test, y_train, y_test)
    my_poly_ridge(X_train, X_test, y_train, y_test)


# Linear regression
def my_linear_regression(X_train, X_test, y_train, y_test):
    lin_reg_model = LinearRegression()      # create the regression model
    lin_reg_model.fit(X_train, y_train)     # train it
    y_pred = lin_reg_model.predict(X_test)  # predict on the test data

    # console output (the score is computed on the training data, as in the original)
    print()
    print('===> Линейная регрессия <===')
    print('Оценка точности: ', lin_reg_model.score(X_train, y_train))

    # plot
    plt.title('Линейная регрессия')
    plt.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=cm_bright_1)
    plt.scatter(X_test[:, 0], X_test[:, 1], c=y_test, cmap=cm_bright_2, alpha=0.8)
    plt.plot(X_test, y_pred, color='red', linewidth=1)
    plt.savefig('1_linear_regression.png')
    plt.show()


# Perceptron
def my_perceptron(X_train, X_test, y_train, y_test):
    perceptron_model = Perceptron()
    perceptron_model.fit(X_train, y_train)
    y_pred = perceptron_model.predict(X_test)

    # console output
    print()
    print('===> Персептрон <===')
    print('Оценка точности: ', perceptron_model.score(X_train, y_train))

    # plot
    plt.title('Персептрон')
    plt.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=cm_bright_1)
    plt.scatter(X_test[:, 0], X_test[:, 1], c=y_test, cmap=cm_bright_2, alpha=0.8)
    plt.plot(X_test, y_pred, color='red', linewidth=1)
    plt.savefig('2_perceptron.png')
    plt.show()


# Ridge polynomial regression (degree=4, alpha=1.0)
def my_poly_ridge(X_train, X_test, y_train, y_test):
    poly_rige_model = make_pipeline(PolynomialFeatures(degree=4), Ridge(alpha=1.0))
    poly_rige_model.fit(X_train, y_train)
    y_pred = poly_rige_model.predict(X_test)

    # console output
    print()
    print('===> Гребневая полиномиальная регрессия <===')
    print('Оценка точности: ', poly_rige_model.score(X_train, y_train))

    # plot
    plt.title('Гребневая полиномиальная регрессия')
    plt.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=cm_bright_1)
    plt.scatter(X_test[:, 0], X_test[:, 1], c=y_test, cmap=cm_bright_2, alpha=0.8)
    plt.plot(X_test, y_pred, color='red', linewidth=1)
    plt.savefig('3_poly_ridge.png')
    plt.show()


if __name__ == "__main__":
    main()
BIN
istyukov_timofey_lab_3/1_dataset.jpg
Normal file
After Width: | Height: | Size: 45 KiB |
BIN
istyukov_timofey_lab_3/2_accuracy_score.jpg
Normal file
After Width: | Height: | Size: 41 KiB |
BIN
istyukov_timofey_lab_3/3_feature_importances.jpg
Normal file
After Width: | Height: | Size: 16 KiB |
73
istyukov_timofey_lab_3/README.md
Normal file
@@ -0,0 +1,73 @@
# Lab 3. Decision trees
## Variant 12
___

### Task:
Using the library implementation of a decision tree, solve the problem from the lab "Decision Tree web service" of the course "Artificial Intelligence Methods" on 99% of your data. Check how the model performs on the remaining percent and draw a conclusion.

### Dataset variant (from the term project):
- Prediction of music genres

___

### Running
- Run the file lab3.py

### Technologies used
- Programming language **Python**
- IDE **PyCharm**
- Libraries:
  * pandas
  * sklearn

### Program description
**Dataset (Kaggle):** the full list of genres included in the CSV: Electronic, Anime, Jazz, Alternative, Country, Rap, Blues, Rock, Classical, Hip-Hop.

**Problem solved by the decision tree:** classifying music tracks based on their characteristics, such as tempo, instrumentalness, acousticness, speechiness, danceability, energy and liveness. The decision tree can predict a track's genre from its characteristics.

**Evaluation tasks:** assess the quality of the decision tree model and identify the most significant features of the dataset.

---
### Example run

*The dataset assembled from random rows of the csv file.*


---
*Comparison of predicted and actual genres on the remaining, unused 0.5% of the dataset rows.*



---
*The computed feature-influence coefficients for the genre prediction*



---

### Conclusion
After preprocessing the dataset, the decision tree trained without problems and correctly predicted some of the genres (in particular Electronic, Classical and Rap). The model also reported how strongly each feature influences the prediction. The most influential feature turned out to be the **acousticness** of a track. Somewhat less significant were speechiness (the prevalence of voices in a track) and instrumentalness (the prevalence of live instruments in a track), which sounds quite reasonable.

In practice, a decision tree's classification quality is inferior to some other methods. Besides that, small changes in the data can substantially change the constructed tree (see the sketch after this README). On my dataset the decision tree did not perform very well. This can be explained by the fact that it does not contain enough data to predict the genre. It is also worth noting that genre is one of the most ambiguous and most composite notions in music.
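One way to see the instability mentioned in the conclusion (not part of the lab) is to repeat the tiny 0.5% split several times and watch the accuracy vary. A hedged sketch, assuming `X` and `y` as prepared in lab3.py below:

```python
# Hedged sketch: repeated random 99.5/0.5 splits quantify how unstable the
# accuracy estimate is on such a tiny test set. Assumes X, y from lab3.py.
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

scores = []
for seed in range(10):
    X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.005, random_state=seed)
    tree = DecisionTreeClassifier(random_state=seed).fit(X_tr.values, y_tr)
    scores.append(accuracy_score(y_te, tree.predict(X_te.values)))
print(min(scores), max(scores))  # the spread is typically wide
```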
69
istyukov_timofey_lab_3/lab3.py
Normal file
@@ -0,0 +1,69 @@
"""
Using the library implementation of a decision tree, solve the problem from the lab
"Decision Tree web service" of the course "Artificial Intelligence Methods" on 99% of your data.
Check how the model performs on the remaining percent and draw a conclusion.
"""

"""
Problem solved by the decision tree: classifying music tracks based on their characteristics,
such as acousticness, danceability, instrumentalness, tempo, etc.
The decision tree can predict a track's genre from its characteristics.
"""

# Variant 12
# Term-project dataset: "Prediction of music genre"

import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score


DATASET_FILE = 'music_genre.csv'


def main():
    df = open_dataset(DATASET_FILE)
    df = df.sample(frac=.1)  # take 10% of random rows, since the dataset is large
    print("\033[92m[-----> Набор данных <-----]\033[00m")
    print(df)

    X = df.drop(columns=['music_genre'])  # the numeric features
    y = df['music_genre']                 # the corresponding genres

    # Split the dataset into training (99.5%) and test (0.5%) data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.005)

    # Build and train the decision tree
    model = DecisionTreeClassifier()
    model.fit(X_train.values, y_train)

    # Predict the genre on the test data
    y_pred = model.predict(X_test.values)

    print("\033[92m\n\n\n[-----> Сравнение жанров <-----]\033[00m")
    df_result = pd.DataFrame({'Прогноз': y_pred, 'Реальность': y_test})
    print(df_result)

    score = accuracy_score(y_test, y_pred)
    print("\033[92m\n> Оценка точности модели: {}\033[00m".format(round(score, 2)))

    print("\033[92m\n\n\n[-----> Оценки важности признаков <-----]\033[00m")
    df_feature = pd.DataFrame({'Признак': X.columns, "Важность": model.feature_importances_})
    print(df_feature)


# Read and clean the csv file
def open_dataset(csv_file):
    # open the file, specifying the delimiter
    df_genres = pd.read_csv(csv_file, delimiter=',')
    # select the required features
    df_genres = df_genres[['tempo', 'instrumentalness', 'acousticness', 'speechiness', 'danceability', 'energy', 'liveness', 'music_genre']]
    # clean the dataset of empty and invalid values
    df_genres = df_genres[df_genres['tempo'] != '?']
    df_genres = df_genres.dropna()
    return df_genres


if __name__ == "__main__":
    main()
50006
istyukov_timofey_lab_3/music_genre.csv
Normal file
BIN
istyukov_timofey_lab_4/1_dendrogram.png
Normal file
After Width: | Height: | Size: 37 KiB |
BIN
istyukov_timofey_lab_4/2_dataset.jpg
Normal file
After Width: | Height: | Size: 29 KiB |
BIN
istyukov_timofey_lab_4/3_clusters.jpg
Normal file
After Width: | Height: | Size: 36 KiB |
78
istyukov_timofey_lab_4/README.md
Normal file
@@ -0,0 +1,78 @@
# Lab 4. Clustering
## Variant 12
___

### Task:
Apply the clustering method given by the variant to your own data, formulating the problem yourself. Interpret the results and assess how well the method suits the problem you formulated.

### Variant:
- Clustering algorithm: **linkage**

### Dataset variant (from the term project):
- Prediction of music genres ("Prediction of music genre")

___

### Running
- Run the file lab4.py

### Technologies used
- Programming language **Python**
- IDE **PyCharm**
- Libraries:
  * pandas
  * scipy
  * matplotlib

### Program description
**Dataset (Kaggle):** the full list of genres included in the CSV: Electronic, Anime, Jazz, Alternative, Country, Rap, Blues, Rock, Classical, Hip-Hop.

**Problem solved by the clustering algorithm:**
Grouping music tracks based on their characteristics in order to build clusters of tracks with similar characteristics. The clustering algorithm can help build playlists and recommendations based on how similar tracks are in certain characteristics.

**Evaluation task:**
Analyzing the resulting hierarchical structure with the help of a dendrogram.

---
### Example run

*The dataset assembled from random rows of the csv file.*



---
*Visualization of the tree representing the hierarchical merging of clusters, as a dendrogram. This can be useful for understanding the structure of the data.*



---
*The first 10 music tracks of the dataset with their cluster assignments*



### Conclusion
The algorithm handles my data quite well. In the result above, two tracks can be compared: "Gake No Ue No Ponyo" and "He Would Have Laughed". The program assigned both to cluster 10, even though the first track is labeled with the "Anime" genre and the second with "Alternative". Nevertheless, the two songs are similar in the prevalence of instrumentals (percussion in particular) and in their Irish motifs.

In the course of the work, 8 pairs of music tracks belonging to different clusters were checked. In the end, more than half of the pairs really did share a lot in their sound or arrangement concept, despite differing in some characteristics (including genre).

Among the advantages of hierarchical clustering is that no specific number of clusters has to be fixed in advance — the count follows from the distance threshold (see the sweep sketched after this README) — which is a clear plus for finding similar music. Among the drawbacks is that it runs too slowly on large datasets (which is why only 50% of the whole dataset was used).
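The number of clusters in lab4.py is controlled indirectly by the distance threshold `t=300` passed to `fcluster`. A hedged sketch (assuming the `linkage_matrix` computed in lab4.py below) of how the cluster count reacts to that threshold:

```python
# Hedged sketch: sweep the distance threshold passed to fcluster and print the
# resulting number of clusters. Assumes linkage_matrix as computed in lab4.py.
from scipy.cluster.hierarchy import fcluster

for t in (100, 200, 300, 400, 500):
    labels = fcluster(Z=linkage_matrix, t=t, criterion='distance')
    print(f"t={t}: {labels.max()} clusters")
```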
85
istyukov_timofey_lab_4/lab4.py
Normal file
@@ -0,0 +1,85 @@
"""
Apply the clustering method given by the variant to your own data, formulating the problem yourself.
Interpret the results and assess how well the method suits the problem you formulated.
"""

"""
Problem solved by the clustering algorithm:
Grouping music tracks based on their characteristics in order to build clusters of tracks
with similar characteristics. The clustering algorithm can help build playlists and recommendations
based on how similar tracks are in certain characteristics.
"""

# Variant 12
# Term-project dataset: "Prediction of music genre"
# Clustering algorithm: linkage


import pandas as pd
from scipy.cluster.hierarchy import linkage, dendrogram, fcluster
import matplotlib.pyplot as plt


DATASET_FILE = 'music_genre.csv'


def main():
    df = open_dataset(DATASET_FILE)
    df = df.sample(frac=.5)  # take 50% of random rows, since the dataset is large
    print("\033[92m[-----> Набор данных <-----]\033[00m")
    print(df)

    # Convert genres and modes (minor/major) into numeric features
    df_genres = pd.get_dummies(df['music_genre'])
    df_modes = pd.get_dummies(df['mode'])
    # Join the main table with the numeric features
    df_music = pd.concat([df, df_genres, df_modes], axis=1).reindex(df.index)
    # Drop the string columns that were replaced with numeric features
    df_music = df_music.drop(columns=['music_genre', 'mode'])
    # The dataset used for clustering (without the artist and track name)
    X = df_music.drop(columns=['artist_name', 'track_name'])

    # Hierarchical clustering with Ward linkage
    # (minimizes the total within-cluster sum of squares)
    linkage_matrix = linkage(X, method='ward', metric='euclidean')

    # Form flat clusters from the linkage matrix
    cluster_label = fcluster(Z=linkage_matrix, t=300, criterion='distance')
    # Assign each track its cluster
    df['cluster'] = cluster_label
    # Show 3 columns when printing
    pd.set_option('display.max_columns', 3)
    # Print the clustering result
    print("\033[92m\n[-----> Результат иерархической кластеризации <-----]\033[00m")
    print(df[['artist_name', 'track_name', 'cluster']].head(10))
    print("\033[92mКоличество кластеров: {}\033[00m".format(cluster_label.max()))

    # Dendrogram
    plt.figure(figsize=(12, 6))
    dendrogram(linkage_matrix, truncate_mode='lastp', p=20, leaf_rotation=90., leaf_font_size=8., show_contracted=True)
    plt.title('Дендрограмма иерархической кластеризации музыкальных треков')
    plt.xlabel('Количество треков в узле')
    plt.ylabel('Евклидово расстояние между треками')
    plt.savefig('1_dendrogram')
    plt.show()


# Read and clean the csv file
def open_dataset(csv_file):
    # open the file, specifying the delimiter
    df = pd.read_csv(csv_file, delimiter=',')
    # select the required features
    df = df[['artist_name', 'track_name', 'mode', 'tempo', 'instrumentalness', 'acousticness',
             'speechiness', 'danceability', 'energy', 'liveness', 'valence', 'music_genre']]
    # clean the dataset of empty and invalid values
    df = df[df['tempo'] != '?']
    df = df.dropna()
    return df


if __name__ == "__main__":
    main()
50006
istyukov_timofey_lab_4/music_genre.csv
Normal file
91
kochkareva_elizaveta_lab_7/README.md
Normal file
@@ -0,0 +1,91 @@

# Lab 7. Variant 15

### Task
Pick a literary text (odd variants use an English-language text) and train a recurrent neural network on it to solve a text-generation task. Tune the architecture and parameters to get as close as possible to a meaningful result.

### How to run the lab
To run the program, execute the following from the project's root directory on the command line:
```
python main.py
```
### Technologies used
- The *numpy* library for working with arrays.
- The *tensorflow* library for machine learning. It provides tools for building and training various machine learning models, including neural networks.

### Lab description
A text of 1,596 lines was used for this lab.

```python
with open('V3001TH2.txt', 'r', encoding='utf-8') as f:
    text = f.read()
```

Next we build the list of unique characters `chars`, as well as the dictionaries `char_to_index` and `index_to_char`, which are used to convert characters to indices and back.

```python
chars = sorted(list(set(text)))
char_to_index = {char: index for index, char in enumerate(chars)}
index_to_char = {index: char for index, char in enumerate(chars)}
```

After that we can generate the training data `train_x` and `train_y`. `train_x` holds character sequences of length `seq_length` taken from the text, and `train_y` holds the character that follows each input sequence. Each character is converted to its index using the `char_to_index` dictionary.

```python
# Generate the training data
seq_length = 100  # length of the input sequence
train_x = []
train_y = []
for i in range(0, text_length - seq_length, 1):
    input_seq = text[i:i + seq_length]
    output_seq = text[i + seq_length]
    train_x.append([char_to_index[char] for char in input_seq])
    train_y.append(char_to_index[output_seq])
```

Next we reshape `train_x` into a three-dimensional array of shape (number of samples, `seq_length`, 1).
We normalize the values of `train_x` by dividing by `num_chars` and convert `train_y` to a `one-hot` representation with `tf.keras.utils.to_categorical`.

```python
train_x = np.reshape(train_x, (len(train_x), seq_length, 1))
train_x = train_x / float(num_chars)
train_y = tf.keras.utils.to_categorical(train_y)
```
Now we move on to building the recurrent neural network model with an `LSTM` layer that takes inputs of shape `(train_x.shape[1], train_x.shape[2])` and a dense layer with softmax activation.
The model is compiled with the `categorical_crossentropy` loss function and the `adam` optimizer.

```python
model = tf.keras.Sequential([
    tf.keras.layers.LSTM(128, input_shape=(train_x.shape[1], train_x.shape[2])),
    tf.keras.layers.Dense(num_chars, activation='softmax')
])

model.compile(loss='categorical_crossentropy', optimizer='adam')
```

We train the model on the training data for 80 epochs with a batch size of 128.
```
model.fit(train_x, train_y, epochs=80, batch_size=128)
```

Text is generated starting from a random index `start_index` into `train_x`. Then, on each loop iteration, the model predicts the next character, appends it to the generated text, and updates `start_seq` for the next iteration.
The generated text is written to the file *'сгенерированный_текст.txt'*.

Output:

```
Ih ses shven they to tore a fit oo th toie th sook a buck and tore tote a siee fot oo the searen.
Jnd buonds sore toee th the shele and thans to the siee and soans tie his and tooning tie hit cnd toens the his and croninng his bioter.


— Iod you ducking tooeeds so toieg a buck and to bor aeeut tore a sigee oo toire a ducn fo toine to see sooeee oo the saelen. Tnd blond toees the sirt and that the sooel and thai to the soeee of the shale.


"Iotk toe ffcrtes," Vincent says suth a suine and a
```
### Conclusion

The text contains some words and phrases that look incorrect or unintelligible. This may be due to an insufficient amount of training data or too few training epochs (a hedged architecture tweak is sketched after this README).
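A hedged sketch (not from the lab) of one common architecture tweak for garbled output like the above: a stacked LSTM with dropout, keeping the same input and output shapes as main.py. Whether it actually helps here is an open question; `train_x` and `num_chars` are assumed to be defined as in main.py below.

```python
# Hedged sketch: a stacked LSTM with dropout, a common upgrade when a single
# 128-unit LSTM underfits. Shapes match train_x/train_y from main.py.
import tensorflow as tf

model = tf.keras.Sequential([
    tf.keras.layers.LSTM(256, return_sequences=True,
                         input_shape=(train_x.shape[1], train_x.shape[2])),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.LSTM(256),
    tf.keras.layers.Dense(num_chars, activation='softmax')
])
model.compile(loss='categorical_crossentropy', optimizer='adam')
```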
1597
kochkareva_elizaveta_lab_7/V3001TH2.txt
Normal file
64
kochkareva_elizaveta_lab_7/main.py
Normal file
@@ -0,0 +1,64 @@
import numpy as np
import tensorflow as tf


def recurrent_neural_network():
    # Load the text file and preprocess the data
    with open('V3001TH2.txt', 'r', encoding='utf-8') as f:
        text = f.read()

    chars = sorted(list(set(text)))
    char_to_index = {char: index for index, char in enumerate(chars)}
    index_to_char = {index: char for index, char in enumerate(chars)}

    num_chars = len(chars)
    text_length = len(text)

    # Generate the training data
    seq_length = 100  # length of the input sequence
    train_x = []
    train_y = []
    for i in range(0, text_length - seq_length, 1):
        input_seq = text[i:i + seq_length]
        output_seq = text[i + seq_length]
        train_x.append([char_to_index[char] for char in input_seq])
        train_y.append(char_to_index[output_seq])

    train_x = np.reshape(train_x, (len(train_x), seq_length, 1))
    train_x = train_x / float(num_chars)
    train_y = tf.keras.utils.to_categorical(train_y)

    model = tf.keras.Sequential([
        tf.keras.layers.LSTM(128, input_shape=(train_x.shape[1], train_x.shape[2])),
        tf.keras.layers.Dense(num_chars, activation='softmax')
    ])

    model.compile(loss='categorical_crossentropy', optimizer='adam')

    # Train the model
    model.fit(train_x, train_y, epochs=80, batch_size=128)

    # Generate text
    start_index = np.random.randint(0, len(train_x) - 1)
    start_seq = train_x[start_index].flatten()  # already normalized to [0, 1]

    generated_text = ''
    for _ in range(500):
        # start_seq is already normalized, so it is not divided again here
        x = np.reshape(start_seq, (1, len(start_seq), 1))

        prediction = model.predict(x, verbose=0)
        index = np.argmax(prediction)
        result = index_to_char[index]

        generated_text += result
        # slide the window, appending the new character's normalized index
        start_seq = np.append(start_seq, index / float(num_chars))
        start_seq = start_seq[1:]

    with open('сгенерированный_текст.txt', 'w', encoding='utf-8') as f:
        f.write(generated_text)


if __name__ == '__main__':
    recurrent_neural_network()
8
kochkareva_elizaveta_lab_7/сгенерированный_текст.txt
Normal file
@@ -0,0 +1,8 @@
Ih ses shven they to tore a fit oo th toie th sook a buck and tore tote a siee fot oo the searen.
Jnd buonds sore toee th the shele and thans to the siee and soans tie his and tooning tie hit cnd toens the his and croninng his bioter.


— Iod you ducking tooeeds so toieg a buck and to bor aeeut tore a sigee oo toire a ducn fo toine to see sooeee oo the saelen. Tnd blond toees the sirt and that the sooel and thai to the soeee of the shale.


"Iotk toe ffcrtes," Vincent says suth a suine and a
63
kozlov_alexey_lab_2/README.md
Normal file
@@ -0,0 +1,63 @@
# Lab 2. Feature ranking
## Variant 14
___

### Task:
Using the code from [1] (the section "Solving the feature-ranking problem", p. 205), rank the features with the models specified by the variant. Display the resulting score of each feature for each method/model as well as the mean score. Analyze the results. Which four features turned out to be the most important by mean score? (The feature names/indices are the answer to the task.)

### Models for the variant:
- Randomized Lasso (RandomizedLasso)
- Feature reduction with random trees (Random Forest Regressor)
- Linear correlation (f_regression)

___

### Running
- Run the file lab2.py

### Technologies used
- Programming language **Python**
- IDE **PyCharm**
- Libraries:
  * sklearn
  * matplotlib
  * numpy

### Program description
1. Imports the required modules and classes:
  - RandomForestRegressor from sklearn.ensemble to build the random forest regression model;
  - RandomizedLasso from RandomizedLasso to build the randomized Lasso model (a regularization method for linear regression);
  - f_regression from sklearn.feature_selection to compute the linear correlation between the features and the target variable;
  - MinMaxScaler from sklearn.preprocessing to scale the feature scores to the [0, 1] range;
  - numpy for working with data arrays.

2. Defines the generation_data function, which generates random data for training the models. For simplicity, predefined random values are used.

3. Defines the rank_to_dict function, which takes the features' rank scores and converts them into a dictionary of values normalized to the range from 0 to 1 (a sketch of this step is given after this README).

4. Defines the get_estimation function, which computes the mean score over all models and prints the list of features sorted by descending score.

5. Defines the print_sorted_data function, which prints the sorted feature scores for each model.

6. Defines the main function, which ties all the steps together: data generation, model training, computing the feature scores, and printing the results.

7. Calls the main function to run the program.

___
### Example run



### Conclusion
Based on the results, the following conclusions can be drawn:

1. Features x4, x2, x14 and x1 are the most important. Their mean scores over all models are 0.82, 0.8, 0.66 and 0.56, respectively.

2. In the random forest regression model, the most significant features are x14, x2, x4 and x1, with scores of 1.0, 0.84, 0.77 and 0.74, respectively.

3. According to the linear correlation (f_regression) results, the most important features are likewise x4, x14, x2 and x12, with scores of 1.0, 0.97, 0.57 and 0.56, respectively.

4. In the randomized Lasso model, the most significant features are x2, x4, x1 and x5, with scores of 1.0, 0.69, 0.49 and 0.44, respectively.

Thus, features x4, x2, x14 and x1 can be considered the most significant across all models.
76
kozlov_alexey_lab_2/RandomizedLasso.py
Normal file
@@ -0,0 +1,76 @@
from sklearn.utils import check_X_y, check_random_state
from sklearn.linear_model import Lasso
from scipy.sparse import issparse
from scipy import sparse


def _rescale_data(x, weights):
    if issparse(x):
        size = weights.shape[0]
        weight_dia = sparse.dia_matrix((1 - weights, 0), (size, size))
        x_rescaled = x * weight_dia
    else:
        x_rescaled = x * (1 - weights)

    return x_rescaled


class RandomizedLasso(Lasso):
    """
    Randomized version of scikit-learn's Lasso class.

    Randomized LASSO is a generalization of the LASSO. The LASSO penalises
    the absolute value of the coefficients with a penalty term proportional
    to `alpha`, but the randomized LASSO changes the penalty to a randomly
    chosen value in the range `[alpha, alpha/weakness]`.

    Parameters
    ----------
    weakness : float
        Weakness value for randomized LASSO. Must be in (0, 1].

    See also
    --------
    sklearn.linear_model.Lasso : the non-randomized model this class extends.
    """
    def __init__(self, weakness=0.5, alpha=1.0, fit_intercept=True,
                 precompute=False, copy_X=True, max_iter=1000,
                 tol=1e-4, warm_start=False, positive=False,
                 random_state=None, selection='cyclic'):
        self.weakness = weakness
        super(RandomizedLasso, self).__init__(
            alpha=alpha, fit_intercept=fit_intercept, precompute=precompute, copy_X=copy_X,
            max_iter=max_iter, tol=tol, warm_start=warm_start,
            positive=positive, random_state=random_state,
            selection=selection)

    def fit(self, X, y):
        """Fit the model according to the given training data.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape = [n_samples, n_features]
            The training input samples.

        y : array-like, shape = [n_samples]
            The target values.
        """
        if not isinstance(self.weakness, float) or not (0.0 < self.weakness <= 1.0):
            raise ValueError('weakness should be a float in (0, 1], got %s' % self.weakness)

        X, y = check_X_y(X, y, accept_sparse=True)

        n_features = X.shape[1]
        weakness = 1. - self.weakness
        random_state = check_random_state(self.random_state)

        weights = weakness * random_state.randint(0, 2, size=(n_features,))

        # TODO: I am afraid this will do double normalization if set to true
        #X, y, _, _ = _preprocess_data(X, y, self.fit_intercept, normalize=self.normalize, copy=False,
        #    sample_weight=None, return_mean=False)

        # TODO: Check if this is a problem if it happens before standardization
        X_rescaled = _rescale_data(X, weights)
        return super(RandomizedLasso, self).fit(X_rescaled, y)
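A quick usage sketch of the class above (synthetic data; the `weakness` and `alpha` values are purely illustrative):

```
import numpy as np
from RandomizedLasso import RandomizedLasso  # the module defined above

rng = np.random.RandomState(0)
X = rng.uniform(size=(100, 5))
y = 2 * X[:, 0] - 3 * X[:, 1] + rng.normal(scale=0.1, size=100)

rl = RandomizedLasso(weakness=0.5, alpha=0.01, random_state=0).fit(X, y)
print(np.round(np.abs(rl.coef_), 3))  # larger |coef| means a higher rank
```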
62
kozlov_alexey_lab_2/lab2.py
Normal file
@@ -0,0 +1,62 @@
from sklearn.ensemble import RandomForestRegressor
from RandomizedLasso import RandomizedLasso
from sklearn.feature_selection import f_regression
from sklearn.preprocessing import MinMaxScaler
import numpy as np

names = ["x%s" % i for i in range(1, 15)]

def main():
    x, y = generation_data()
    # Feature reduction with random trees (Random Forest Regressor)
    rfr = RandomForestRegressor()
    rfr.fit(x, y)
    # Linear correlation model
    f, _ = f_regression(x, y, center=False)
    # Randomized Lasso
    randomized_lasso = RandomizedLasso(alpha=.01)
    randomized_lasso.fit(x, y)

    ranks = {"Random Forest Regressor": rank_to_dict(rfr.feature_importances_),
             'f-Regression': rank_to_dict(f),
             "Randomized Lasso": rank_to_dict(randomized_lasso.coef_)}

    get_estimation(ranks)
    print_sorted_data(ranks)

def generation_data():
    np.random.seed(0)
    size = 750
    X = np.random.uniform(0, 1, (size, 14))
    Y = (10 * np.sin(np.pi * X[:, 0] * X[:, 1]) + 20 * (X[:, 2] - .5) ** 2 +
         10 * X[:, 3] + 5 * X[:, 4] ** 5 + np.random.normal(0, 1))
    X[:, 10:] = X[:, :4] + np.random.normal(0, .025, (size, 4))
    return X, Y

def rank_to_dict(ranks):
    ranks = np.abs(ranks)
    minmax = MinMaxScaler()
    ranks = minmax.fit_transform(np.array(ranks).reshape(14, 1)).ravel()
    ranks = map(lambda x: round(x, 2), ranks)
    return dict(zip(names, ranks))

def get_estimation(ranks: dict):
    mean = {}
    for key, value in ranks.items():
        for item in value.items():
            if item[0] not in mean:
                mean[item[0]] = 0
            mean[item[0]] += item[1]
    for key, value in mean.items():
        res = value / len(ranks)
        mean[key] = round(res, 2)
    mean_sorted = sorted(mean.items(), key=lambda item: item[1], reverse=True)
    print("Mean scores")
    print(mean_sorted)
    print("Top 4 features by mean score")
    for item in mean_sorted[:4]:
        print('{0} - {1}'.format(item[0], item[1]))

def print_sorted_data(ranks: dict):
    print()
    for key, value in ranks.items():
        ranks[key] = sorted(value.items(), key=lambda item: item[1], reverse=True)
    for key, value in ranks.items():
        print(key)
        print(value)


if __name__ == "__main__":
    main()
BIN
kozlov_alexey_lab_2/results.png
Normal file
After Width: | Height: | Size: 32 KiB |
BIN
romanova_adelina_lab_3/1.png
Normal file
After Width: | Height: | Size: 76 KiB |
BIN
romanova_adelina_lab_3/2.png
Normal file
After Width: | Height: | Size: 184 KiB |
BIN
romanova_adelina_lab_3/3.png
Normal file
After Width: | Height: | Size: 25 KiB |
BIN
romanova_adelina_lab_3/4.png
Normal file
After Width: | Height: | Size: 43 KiB |
77
romanova_adelina_lab_3/README.md
Normal file
@@ -0,0 +1,77 @@
# Lab work No. 3. Variant 21

## Topic:
Decision trees

## Model:

Decision Tree Classifier

## How to run the program:
Install *python, numpy, matplotlib, sklearn*
```
python main.py
```

## Technologies used:
The Python programming language; the numpy, matplotlib and sklearn libraries

VSCode as the IDE

# What the lab work does:
It uses the "UCI Heart Disease Data" dataset and trains a ```Decision Tree Classifier``` model.

The UCI Heart Disease dataset contains various clinical features, such as age, sex, blood pressure, cholesterol, electrocardiographic findings and others, as well as a target variable indicating the presence or absence of heart disease.

First the data must be preprocessed so the model can take them as input. Initially the data look like this:

![alt text](1.png)

Since machine-learning models can work exclusively with numeric values, all data must be brought to that format, and only complete rows whose feature values are not empty should be used. This is done with the function shown below:

![alt text](2.png)
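In essence (a minimal standalone sketch, not the exact code from the screenshot), the conversion label-encodes every non-numeric column:

```
import pandas as pd
from sklearn.preprocessing import LabelEncoder

df = pd.DataFrame({"sex": ["Male", "Female", "Male"], "age": [63, 41, 57]})
for col in df.select_dtypes(exclude="number").columns:
    df[col] = LabelEncoder().fit_transform(df[col].astype(str))
print(df)  # sex becomes {Female: 0, Male: 1}
```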
Next the target value is reduced to binary form, since the original field takes several values (disease stages). After that the approach called "feature engineering" is applied to obtain more features that may help the model solve the task, following the usual machine/deep-learning logic: more data, better results. The new features are produced by the function below, and the updated dataset is then converted to numeric format again.

```
def fe_creation(df):
    # Feature engineering (FE)
    df['age2'] = df['age']//10
    df['trestbps2'] = df['trestbps']//10
    df['chol2'] = df['chol']//60
    df['thalch2'] = df['thalch']//40
    df['oldpeak2'] = df['oldpeak']//0.4
    for i in ['sex', 'age2', 'fbs', 'restecg', 'exang']:
        for j in ['cp','trestbps2', 'chol2', 'thalch2', 'oldpeak2', 'slope']:
            df[i + "_" + j] = df[i].astype('str') + "_" + df[j].astype('str')
    return df
```
After applying this function the number of features grows from 12 to 47. All features are then standardized using the formula z = (x - mean)/std, where x is the current feature value, mean is the expected value of the feature's column, std is its standard deviation, and z is the new value of x. After all these steps the data are ready for training the trees.
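For reference, a minimal sketch of that standardization (hypothetical toy columns), done both by hand and with scikit-learn's StandardScaler:

```
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler

df = pd.DataFrame({"age": [54.0, 61.0, 47.0], "chol": [230.0, 310.0, 198.0]})

# Manual z-score; ddof=0 matches StandardScaler's population std
z_manual = (df - df.mean()) / df.std(ddof=0)
z_sklearn = pd.DataFrame(StandardScaler().fit_transform(df), columns=df.columns)
print(np.allclose(z_manual, z_sklearn))  # True
```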
```Decision Tree Classifier``` is a machine-learning algorithm that uses a tree structure to make decisions. Each node of the tree is a test on some feature, and each branch is a possible outcome of that test. The goal is to split the data into subgroups in which a single class dominates.

```
decision_tree = DecisionTreeClassifier()
param_grid = {'min_samples_leaf': [i for i in range(2,12)]}
decision_tree_CV = GridSearchCV(decision_tree, param_grid=param_grid, cv=cv_train, verbose=False)
decision_tree_CV.fit(train, train_target)
print(decision_tree_CV.best_params_)

acc_all = acc_metrics_calc(0, acc_all, decision_tree_CV, train, valid, train_target, valid_target, title="Decision Tree Classifier")
plot_learning_curve(decision_tree_CV, "Decision Tree", train, train_target, cv=cv_train)

feature_importances_dt = decision_tree_CV.best_estimator_.feature_importances_
plot_feature_importance(feature_importances_dt, data.columns, "Decision Tree")
```

First a Decision Tree Classifier was trained; the GridSearch algorithm found the best hyperparameters for the task. Below are plots showing the quality and the training process of this classifier.

![alt text](3.png)

The next plot shows which features the model considered most important:

![alt text](4.png)

## Conclusion

On the training data we predict correctly in most cases, while on the validation data a problem appears with detecting the second class, which indicates the presence of disease.
302
romanova_adelina_lab_3/main.py
Normal file
@@ -0,0 +1,302 @@
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import sklearn
from sklearn.preprocessing import (LabelEncoder,
                                   StandardScaler,
                                   MinMaxScaler,
                                   RobustScaler)
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold, learning_curve, ShuffleSplit
from sklearn.model_selection import cross_val_predict as cvp
from sklearn import metrics
from sklearn.metrics import mean_absolute_error, mean_squared_error, accuracy_score, confusion_matrix, explained_variance_score

from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import RandomForestClassifier


def str_features_to_numeric(data):
    # Converts all string features to numeric ones.

    # Identify the categorical features
    categorical_columns = []
    numerics = ['int8', 'int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    features = data.columns.values.tolist()
    for col in features:
        if data[col].dtype in numerics: continue
        categorical_columns.append(col)

    # Encode the categorical features
    for col in categorical_columns:
        if col in data.columns:
            le = LabelEncoder()
            le.fit(list(data[col].astype(str).values))
            data[col] = le.transform(list(data[col].astype(str).values))

    return data


def fe_creation(df):
    # Feature engineering (FE)
    df['age2'] = df['age']//10
    df['trestbps2'] = df['trestbps']//10
    df['chol2'] = df['chol']//60
    df['thalch2'] = df['thalch']//40
    df['oldpeak2'] = df['oldpeak']//0.4
    for i in ['sex', 'age2', 'fbs', 'restecg', 'exang']:
        for j in ['cp','trestbps2', 'chol2', 'thalch2', 'oldpeak2', 'slope']:
            df[i + "_" + j] = df[i].astype('str') + "_" + df[j].astype('str')
    return df


def acc_d(y_meas, y_pred):
    # Relative error between the predicted values y_pred and the measured values y_meas
    return mean_absolute_error(y_meas, y_pred)*len(y_meas)/sum(abs(y_meas))


def acc_rmse(y_meas, y_pred):
    # Root-mean-square error between the predicted values y_pred and the measured values y_meas
    return (mean_squared_error(y_meas, y_pred))**0.5


def plot_cm(train_target, train_target_pred, valid_target, valid_target_pred, title):
    # Plotting the confusion matrices

    def cm_calc(y_true, y_pred):
        cm = confusion_matrix(y_true, y_pred, labels=np.unique(y_true))
        cm_sum = np.sum(cm, axis=1, keepdims=True)
        cm_perc = cm / cm_sum.astype(float) * 100
        annot = np.empty_like(cm).astype(str)
        nrows, ncols = cm.shape
        for i in range(nrows):
            for j in range(ncols):
                c = cm[i, j]
                p = cm_perc[i, j]
                if i == j:
                    s = cm_sum[i]
                    annot[i, j] = '%.1f%%\n%d/%d' % (p, c, s)
                elif c == 0:
                    annot[i, j] = ''
                else:
                    annot[i, j] = '%.1f%%\n%d' % (p, c)
        cm = pd.DataFrame(cm, index=np.unique(y_true), columns=np.unique(y_true))
        cm.index.name = 'Actual'
        cm.columns.name = 'Predicted'
        return cm, annot

    fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(16, 6), sharex=True)

    # Training data
    ax = axes[0]
    ax.set_title("for training data")
    cm0, annot0 = cm_calc(train_target, train_target_pred)
    sns.heatmap(cm0, cmap="YlGnBu", annot=annot0, fmt='', ax=ax)

    # Test data
    ax = axes[1]
    ax.set_title("for test (validation) data")
    cm1, annot1 = cm_calc(valid_target, valid_target_pred)
    sns.heatmap(cm1, cmap="YlGnBu", annot=annot1, fmt='', ax=ax)

    fig.suptitle(f'CONFUSION MATRICES for {title}')
    plt.savefig(f'CONFUSION MATRICES for {title}.png')
    plt.show()


def acc_metrics_calc(num, acc_all, model, train, valid, train_target, valid_target, title):
    # Model selection stage
    # Computes the model's quality by several metrics

    ytrain = model.predict(train).astype(int)
    yvalid = model.predict(valid).astype(int)
    print('train_target = ', train_target[:5].values)
    print('ytrain = ', ytrain[:5])
    print('valid_target =', valid_target[:5].values)
    print('yvalid =', yvalid[:5])

    num_acc = 0
    for x in metrics_now:
        if x == 1:
            # accuracy score
            acc_train = round(metrics.accuracy_score(train_target, ytrain), 2)
            acc_valid = round(metrics.accuracy_score(valid_target, yvalid), 2)
        elif x == 2:
            # rmse criterion
            acc_train = round(acc_rmse(train_target, ytrain), 2)
            acc_valid = round(acc_rmse(valid_target, yvalid), 2)
        elif x == 3:
            # relative-error criterion
            acc_train = round(acc_d(train_target, ytrain) * 100, 2)
            acc_valid = round(acc_d(valid_target, yvalid) * 100, 2)

        print('acc of', metrics_all[x], 'for train =', acc_train)
        print('acc of', metrics_all[x], 'for valid =', acc_valid)
        acc_all[num_acc].append(acc_train)  # train
        acc_all[num_acc+1].append(acc_valid)  # valid
        num_acc += 2

    # Plot the confusion matrices
    plot_cm(train_target, ytrain, valid_target, yvalid, title)

    return acc_all


def plot_feature_importance(feature_importances, feature_names, model_name):
    # Create the color palette
    colors = sns.color_palette('viridis', len(feature_importances))

    # Sort the feature-importance indices
    indices = feature_importances.argsort()[::-1]

    # Build a styled bar plot
    plt.figure(figsize=(12, 8))
    ax = sns.barplot(x=feature_importances[indices], y=feature_names[indices], palette=colors)

    # Decorations
    plt.xlabel('Feature importance', fontsize=14)
    plt.ylabel('Features', fontsize=14)
    plt.title(f'Feature importances in the {model_name} model', fontsize=16)
    plt.xticks(fontsize=12)
    plt.yticks(fontsize=12)

    # Add a color bar with a label
    cbar = plt.colorbar(plt.cm.ScalarMappable(cmap='viridis'), ax=ax)
    cbar.set_label('Importance level', rotation=270, labelpad=15, fontsize=12)

    # Add a grid for readability
    plt.grid(axis='x', linestyle='--', alpha=0.6)

    # Save the plot to a file and show it
    plt.savefig(f'feature_importances_{model_name}.png', bbox_inches='tight')
    plt.show()


def plot_learning_curve(estimator, title, X, y, cv=None, axes=None, ylim=None,
                        n_jobs=None, train_sizes=np.linspace(.1, 1.0, 5), random_state=0):
    # Create a figure with two stacked axes unless a pair is supplied
    if axes is None:
        fig, axes = plt.subplots(2, 1, figsize=(20, 10))

    axes[0].set_title(title)
    if ylim is not None:
        axes[0].set_ylim(*ylim)
    axes[0].set_xlabel("Training examples")
    axes[0].set_ylabel("Score")

    train_sizes, train_scores, test_scores, fit_times, _ = \
        learning_curve(estimator=estimator, X=X, y=y, cv=cv,
                       train_sizes=train_sizes,
                       return_times=True)

    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    fit_times_mean = np.mean(fit_times, axis=1)
    fit_times_std = np.std(fit_times, axis=1)

    # Plot learning curve
    axes[0].grid()
    axes[0].fill_between(train_sizes, train_scores_mean - train_scores_std,
                         train_scores_mean + train_scores_std, alpha=0.1,
                         color="r")
    axes[0].fill_between(train_sizes, test_scores_mean - test_scores_std,
                         test_scores_mean + test_scores_std, alpha=0.1,
                         color="g")
    axes[0].plot(train_sizes, train_scores_mean, 'o-', color="r",
                 label="Training score")
    axes[0].plot(train_sizes, test_scores_mean, 'o-', color="g",
                 label="Cross-validation score")
    axes[0].legend(loc="best")

    # Plot n_samples vs fit_times
    axes[1].grid()
    axes[1].plot(train_sizes, fit_times_mean, 'o-')
    axes[1].fill_between(train_sizes, fit_times_mean - fit_times_std,
                         fit_times_mean + fit_times_std, alpha=0.1)
    axes[1].set_xlabel("Training examples")
    axes[1].set_ylabel("fit_times")
    axes[1].set_title("Scalability of the model")

    plt.savefig(f'{title}.png')

    plt.show()
    return


if __name__ == "__main__":
    # Load the data
    # Transform and preprocess the data
    # Train the Decision Tree Classifier and Random Forest Classifier models
    # Compute the metrics and draw the plots
    cv_n_split = 5
    random_state = 42
    test_train_split_part = 0.25

    metrics_all = {1: 'acc', 2: 'rmse', 3: 're'}
    metrics_now = [1, 2, 3]

    data = pd.read_csv("..//heart_disease_uci.csv")
    data['target'] = data['num']
    data = data.drop(columns=['id', 'dataset', 'ca', 'thal', 'num'])

    data = data[(data['chol'] <= 420) & (data['oldpeak'] >= 0) & (data['oldpeak'] <= 4)].reset_index(drop=True)
    data = data.dropna().reset_index(drop=True)
    print(data.info())

    data = str_features_to_numeric(data)
    data = data[data['target'].isin([0, 1])]  # reduce the target column to binary form

    data = fe_creation(data)
    data = str_features_to_numeric(data)

    dataset = data.copy()  # original data
    target_name = 'target'
    target = data.pop(target_name)

    # Data standardization
    # The standard score of a sample x is calculated as:
    # z = (x - mean) / (standard deviation)
    scaler = StandardScaler()
    data = pd.DataFrame(scaler.fit_transform(data), columns=data.columns)

    train, valid, train_target, valid_target = train_test_split(data, target, test_size=test_train_split_part, random_state=random_state)

    # lists of accuracies of all models - amount of metrics_now * 2 (train & valid datasets)
    acc_all = np.empty((len(metrics_now)*2, 0)).tolist()
    acc_all_pred = np.empty((len(metrics_now), 0)).tolist()

    cv_train = ShuffleSplit(n_splits=cv_n_split, test_size=test_train_split_part, random_state=random_state)

    decision_tree = DecisionTreeClassifier()
    param_grid = {'min_samples_leaf': [i for i in range(2, 12)]}
    decision_tree_CV = GridSearchCV(decision_tree, param_grid=param_grid, cv=cv_train, verbose=False)
    decision_tree_CV.fit(train, train_target)
    print(decision_tree_CV.best_params_)

    acc_all = acc_metrics_calc(0, acc_all, decision_tree_CV, train, valid, train_target, valid_target, title="Decision Tree Classifier")
    plot_learning_curve(decision_tree_CV, "Decision Tree", train, train_target, cv=cv_train)

    feature_importances_dt = decision_tree_CV.best_estimator_.feature_importances_
    plot_feature_importance(feature_importances_dt, data.columns, "Decision Tree")
BIN
romanova_adelina_lab_4/1.png
Normal file
After Width: | Height: | Size: 20 KiB |
BIN
romanova_adelina_lab_4/2.png
Normal file
After Width: | Height: | Size: 21 KiB |
BIN
romanova_adelina_lab_4/3.png
Normal file
After Width: | Height: | Size: 60 KiB |
76
romanova_adelina_lab_4/README.md
Normal file
@@ -0,0 +1,76 @@
# Lab work No. 4. Variant 21

## Topic:
Clustering

## Model:

KMeans

## How to run the program:
Install *python, numpy, matplotlib, sklearn*
```
python main.py
```

## Technologies used:
The Python programming language; the numpy, matplotlib and sklearn libraries

VSCode as the IDE

# What the lab work does:

The clustering task consists of splitting a dataset into groups, called clusters, so that objects within one cluster are more similar to each other than to objects from other clusters. This reveals hidden structure in the data and simplifies subsequent analysis and decision-making.

This work examines the ```KMeans``` model.

### Description:
```KMeans``` splits the data into K clusters, where K is a number fixed in advance. It minimizes the sum of squared distances between the data points and the centers of their assigned clusters. The algorithm is simple to implement and works well for spherical clusters.
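A minimal sketch of that objective on synthetic data: `inertia_` is exactly the sum of squared distances that KMeans minimizes (all parameter values below are illustrative):

```
import numpy as np
from sklearn.cluster import KMeans

rng = np.random.default_rng(0)
X = np.vstack([rng.normal(0, 0.3, (50, 2)), rng.normal(3, 0.3, (50, 2))])

km = KMeans(n_clusters=2, n_init=10, random_state=0).fit(X)
# inertia_ equals the sum of squared distances to the assigned centers
d2 = ((X - km.cluster_centers_[km.labels_]) ** 2).sum()
print(km.inertia_, d2)  # the two values agree
```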
Data clustering is a powerful tool for finding patterns in large volumes of information, and the choice of a specific algorithm depends on the nature of the data and the task at hand. In this work we look at the algorithm in more detail, note its strengths and weaknesses, and illustrate its use in practice.

The clusters are obtained by the following procedure (see the code right after this list):

1. Obtain the source data

2. Convert all of it to numeric format

3. Train on the prepared data

```
def clustering_df(X, n, m, output_hist, title='clusters_by'):

    X_columns = X.columns
    scaler = StandardScaler()
    scaler.fit(X)
    X = pd.DataFrame(scaler.transform(X), columns = X_columns)
    cl = generate_clustering_algorithms(X, n, m)
    cl.fit(X)
    if hasattr(cl, 'labels_'):
        labels = cl.labels_.astype(np.uint8)
    else:
        labels = cl.predict(X)
    clusters=pd.concat([X, pd.DataFrame({'cluster':labels})], axis=1)
```
All columns were used for clustering; part of the code is shown below:

```
print(data.select_dtypes(include='object').columns.tolist())
for column in data.select_dtypes(include='object').columns.tolist():
    data[column] = pd.factorize(data[column])[0]
```
The program generates histograms for each cluster against every feature. The most interesting ones to me were the person's age and disease status.

![alt text](1.png)

![alt text](2.png)

Studying the plots above, we can conclude that almost everyone in cluster No. 3 is ill, most of them at stages 2, 3 and 4, and that these people are between 45 and 70 years old.

Below is the result of training the clustering algorithm:

![alt text](3.png)

## Conclusion

I think the ```KMeans``` algorithm did quite well, since each of its clusters came out distinct, i.e. clearly separated from the others. This suggests that this algorithm managed to capture the key features of each cluster.
166
romanova_adelina_lab_4/main.py
Normal file
@@ -0,0 +1,166 @@
import os
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn import cluster, mixture
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans, DBSCAN, OPTICS
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import kneighbors_graph
from itertools import cycle, islice

import warnings
warnings.simplefilter('ignore')


def generate_clustering_algorithms(Z, n_clusters, m):
    # Generate clustering algorithms:
    # m = 'MeanShift', 'KMeans', 'MiniBatchKMeans'

    # The minimal percentage of similarity of the clustered feature with "Survived" for inclusion in the final dataset
    limit_opt = 0.7

    params = {'quantile': .2,
              'eps': .3,
              'damping': .9,
              'preference': -200,
              'n_neighbors': 10,
              'n_clusters': n_clusters,
              'min_samples': 3,
              'xi': 0.05,
              'min_cluster_size': 0.05}

    # estimate bandwidth for mean shift
    bandwidth = cluster.estimate_bandwidth(Z, quantile=params['quantile'])

    # connectivity matrix for structured Ward
    connectivity = kneighbors_graph(
        Z, n_neighbors=params['n_neighbors'], include_self=False)

    # make connectivity symmetric
    connectivity = 0.5 * (connectivity + connectivity.T)

    # ============
    # Create cluster objects
    # ============
    if m == 'MeanShift':
        cl = cluster.MeanShift(bandwidth=bandwidth, bin_seeding=True)
    elif m == 'KMeans':
        cl = cluster.KMeans(n_clusters=n_clusters, random_state=1000)
    elif m == 'MiniBatchKMeans':
        cl = cluster.MiniBatchKMeans(n_clusters=n_clusters)

    return cl


def clustering_df(X, n, m, output_hist, title='clusters_by'):

    # Standardization
    X_columns = X.columns
    scaler = StandardScaler()
    scaler.fit(X)
    X = pd.DataFrame(scaler.transform(X), columns=X_columns)
    cl = generate_clustering_algorithms(X, n, m)
    cl.fit(X)
    if hasattr(cl, 'labels_'):
        labels = cl.labels_.astype(np.uint8)
    else:
        labels = cl.predict(X)
    clusters = pd.concat([X, pd.DataFrame({'cluster': labels})], axis=1)

    # Inverse standardization
    X_inv = pd.DataFrame(scaler.inverse_transform(X), columns=X_columns)
    clusters_inv = pd.concat([X_inv, pd.DataFrame({'cluster': labels})], axis=1)

    # Number of points in clusters
    print("Number of points in clusters:\n", clusters['cluster'].value_counts())

    # Data in clusters - thanks to https://www.kaggle.com/sabanasimbutt/clustering-visualization-of-clusters-using-pca
    if output_hist:
        for c in clusters:
            grid = sns.FacetGrid(clusters_inv, col='cluster')
            grid.map(plt.hist, c)

        plt.savefig(f'{title}_by_method_{m}.png')

    return clusters, clusters_inv


def plot_draw(X, title, m):
    # Drawing a plot with clusters on the plane (using PCA transformation)
    # Thanks to https://www.kaggle.com/sabanasimbutt/clustering-visualization-of-clusters-using-pca

    dist = 1 - cosine_similarity(X)

    # PCA transform
    pca = PCA(2)
    pca.fit(dist)
    X_PCA = pca.transform(dist)

    # Generate point numbers and colors for clusters
    hsv = plt.get_cmap('hsv')
    n_clusters = max(X['cluster'].value_counts().index) - min(X['cluster'].value_counts().index) + 2
    colors = list(hsv(np.linspace(0, 1, n_clusters)))
    colors_num = list(np.linspace(min(X['cluster'].value_counts().index), max(X['cluster'].value_counts().index), n_clusters))
    colors_num = [int(x) for x in colors_num]
    colors_str = [str(x) for x in colors_num]
    names_dict = dict(zip(colors_num, colors_str))
    colors_dict = dict(zip(colors_num, colors))

    # Visualization
    x, y = X_PCA[:, 0], X_PCA[:, 1]

    df = pd.DataFrame({'x': x, 'y': y, 'label': X['cluster'].tolist()})
    groups = df.groupby('label')

    fig, ax = plt.subplots(figsize=(12, 8))

    for name, group in groups:
        ax.plot(group.x, group.y, marker='o', linestyle='', ms=10,
                color=colors_dict[name],
                label=names_dict[name],
                mec='none')
        ax.set_aspect('auto')
        ax.tick_params(axis='x', which='both', bottom='off', top='off', labelbottom='off')
        ax.tick_params(axis='y', which='both', left='off', top='off', labelleft='off')

    ax.legend(loc='upper right')
    ax.set_title(f"{title} by method {m}")
    plt.savefig(f'{title}_by_method_{m}.png')
    plt.show()


if __name__ == "__main__":
    data = pd.read_csv("..//heart_disease_uci.csv")
    data = data.drop_duplicates().reset_index(drop=True)

    print(data.select_dtypes(include='object').columns.tolist())
    for column in data.select_dtypes(include='object').columns.tolist():
        data[column] = pd.factorize(data[column])[0]
        # print(pd.factorize(data[column])[0])

    methods_all = ['KMeans', 'MiniBatchKMeans', 'MeanShift']
    n_default = 6

    data = data[data.notna().all(axis=1)]

    res = dict(zip(methods_all, [False]*len(methods_all)))
    n_clust = dict(zip(methods_all, [1]*len(methods_all)))
    for method in methods_all:
        print(f"Method - {method}")
        Y, Y_inv = clustering_df(data.copy(), n_default, method, True)

        # If the number of clusters is less than 2, then the clustering is not successful
        n_cl = len(Y['cluster'].value_counts())
        if n_cl > 1:
            res[method] = True
            n_clust[method] = n_cl

            plot_draw(Y, "Data clustering", method)
        else:
            print('Clustering is not successful because all data is in one cluster!\n')
BIN
romanova_adelina_lab_5/1.png
Normal file
After Width: | Height: | Size: 54 KiB |
79
romanova_adelina_lab_5/README.md
Normal file
@@ -0,0 +1,79 @@
# Lab work No. 5. Variant 21

## Topic:
Regression

## Model:

LinearRegression

## How to run the program:
Install *python, numpy, matplotlib, sklearn*
```
python lab.py
```

## Technologies used:
The Python programming language; the numpy, matplotlib and sklearn libraries

VSCode as the IDE

# What the lab work does:

Since a patient's resting blood pressure is an important medical indicator, it was chosen as the value to predict from the available features, such as age, sex and others.

Applying linear regression to the task of predicting resting blood pressure brings several key advantages.

Linear regression is a powerful tool in statistics and machine learning, widely used for analysing and modelling relationships between dependent and independent variables. Its main goal is to build a linear function that best approximates the relationship between the input data and the target variable, which makes it possible to predict target values from new inputs.

### Description:
```LinearRegression``` is based on ordinary least squares: it minimizes the sum of squared differences between the actual and predicted values (equivalently, the mean squared error, MSE). The algorithm admits a closed-form solution for the coefficients of the linear model, which makes it efficient and easy to understand.
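A minimal sketch of that closed-form solution on synthetic data, checked against scikit-learn (all names and values below are illustrative):

```
import numpy as np
from sklearn.linear_model import LinearRegression

rng = np.random.default_rng(0)
X = rng.normal(size=(100, 3))
y = X @ np.array([1.5, -2.0, 0.5]) + 0.7 + rng.normal(scale=0.1, size=100)

# Closed-form least squares with a bias column: w = argmin ||A w - y||^2
A = np.hstack([X, np.ones((100, 1))])
w = np.linalg.lstsq(A, y, rcond=None)[0]

reg = LinearRegression().fit(X, y)
print(np.allclose(w[:3], reg.coef_), np.allclose(w[3], reg.intercept_))  # True True
```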
Training a linear regression requires the following steps:

1. Obtain the source data

2. Choose the target value to predict

3. Process the data so that every feature is in numeric format, and add normalization or, more precisely, standardization of the data

4. Train the chosen model on the prepared data

The data are processed with the function ```str_features_to_numeric```:

```
def str_features_to_numeric(data):
    # Converts all string features to numeric ones.

    # Identify the categorical features
    categorical_columns = []
    numerics = ['int8', 'int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    features = data.columns.values.tolist()
    for col in features:
        if data[col].dtype in numerics: continue
        categorical_columns.append(col)

    # Encode the categorical features
    for col in categorical_columns:
        if col in data.columns:
            le = LabelEncoder()
            le.fit(list(data[col].astype(str).values))
            data[col] = le.transform(list(data[col].astype(str).values))

    return data
```

Then the data are standardized with ```StandardScaler```.

The chosen target is the resting blood pressure, ```trestbps``` (in mm Hg on admission to the hospital). The processed data are fed into the training of the linear-regression model:

![alt text](1.png)

- reg.score() - reports the model's quality (for regression this is the R² coefficient of determination)

- reg.coef_ - holds the coefficients of the features, in order

- reg.intercept_ - holds the offset parameter (the bias)
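As a sanity check, a fitted model's prediction is just the linear function of those attributes; a tiny standalone sketch:

```
import numpy as np
from sklearn.linear_model import LinearRegression

rng = np.random.default_rng(0)
X = rng.normal(size=(20, 2))
y = 3 * X[:, 0] - X[:, 1] + 2
reg = LinearRegression().fit(X, y)

# predict() is exactly X @ coef_ + intercept_
print(np.allclose(X @ reg.coef_ + reg.intercept_, reg.predict(X)))  # True
```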
## Conclusion

Based on the results obtained, the classical linear-regression model is more than suitable for this particular task.
87
romanova_adelina_lab_5/lab.py
Normal file
@@ -0,0 +1,87 @@
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
from sklearn.linear_model import LinearRegression, SGDRegressor, Ridge


from sklearn.preprocessing import (LabelEncoder,
                                   StandardScaler,
                                   MinMaxScaler,
                                   RobustScaler)
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold, learning_curve, ShuffleSplit


def str_features_to_numeric(data):
    # Converts all string features to numeric ones.

    # Identify the categorical features
    categorical_columns = []
    numerics = ['int8', 'int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    features = data.columns.values.tolist()
    for col in features:
        if data[col].dtype in numerics: continue
        categorical_columns.append(col)

    # Encode the categorical features
    for col in categorical_columns:
        if col in data.columns:
            le = LabelEncoder()
            le.fit(list(data[col].astype(str).values))
            data[col] = le.transform(list(data[col].astype(str).values))

    return data


if __name__ == "__main__":
    data = pd.read_csv("..//heart_disease_uci.csv")
    data['target'] = data['trestbps']
    data = data.drop(columns=['id', 'dataset', 'trestbps'])

    data_wo_null = data.dropna()
    print(len(data_wo_null))

    encoded_data_wo_null = str_features_to_numeric(data_wo_null)
    print(len(encoded_data_wo_null))

    dataset = data_wo_null.copy()  # original data
    target_name = 'target'
    # Pop the target before scaling so it does not leak into the feature matrix
    target = encoded_data_wo_null.pop(target_name)

    # Data standardization
    # The standard score of a sample x is calculated as:
    # z = (x - mean) / (standard deviation)
    scaler = StandardScaler()
    new_data = pd.DataFrame(scaler.fit_transform(encoded_data_wo_null), columns=encoded_data_wo_null.columns)

    test_train_split_part = 0.2
    random_state = 42

    train, valid, train_target, valid_target = train_test_split(new_data, target,
                                                                test_size=test_train_split_part,
                                                                random_state=random_state)

    reg = LinearRegression().fit(train, train_target)

    print("---"*15, " LinearRegression ", "---"*15)
    print(f"R^2 score: {reg.score(valid, valid_target)}")
    print(f"Coefficients: {reg.coef_}")
    print(f"Offset from the origin (bias): {reg.intercept_}")

    SGD_reg = SGDRegressor(max_iter=1000, tol=1e-3)
    SGD_reg.fit(train, train_target)

    print("---"*15, " SGDRegressor ", "---"*15)
    print(f"R^2 score: {SGD_reg.score(valid, valid_target)}")
    print(f"Coefficients: {SGD_reg.coef_}")
    print(f"Offset from the origin (bias): {SGD_reg.intercept_}")

    Ridge_clf = Ridge(alpha=1.0)
    Ridge_clf.fit(train, train_target)

    print("---"*15, " Ridge ", "---"*15)
    print(f"R^2 score: {Ridge_clf.score(valid, valid_target)}")
    print(f"Coefficients: {Ridge_clf.coef_}")
    print(f"Offset from the origin (bias): {Ridge_clf.intercept_}")
BIN
romanova_adelina_lab_6/1.png
Normal file
After Width: | Height: | Size: 158 KiB |
BIN
romanova_adelina_lab_6/2.png
Normal file
After Width: | Height: | Size: 47 KiB |
BIN
romanova_adelina_lab_6/3.png
Normal file
After Width: | Height: | Size: 48 KiB |
BIN
romanova_adelina_lab_6/4.png
Normal file
After Width: | Height: | Size: 98 KiB |
47
romanova_adelina_lab_6/README.md
Normal file
@@ -0,0 +1,47 @@
# Lab work No. 6. Variant 21

## Topic:
Neural network

## Model:

MLPClassifier

## How to run the program:
Install *python, numpy, matplotlib, sklearn*
```
python lab.py
```

## Technologies used:
The Python programming language; the numpy, matplotlib and sklearn libraries

VSCode as the IDE

# What the lab work does:

In this study of neural networks, multilayer perceptrons (MLPs) in particular, a detailed analysis was made of how the network architecture affects its performance on the task of classifying heart-disease stages. Experiments with different layer configurations and sizes gave a deeper understanding of which network parameters have the greatest impact on prediction accuracy.

The MLP used in the code is the ```sklearn.neural_network.MLPClassifier``` class, and the target task is predicting the presence of heart disease (0 - absent; 1, 2, 3, 4 - disease stages).

The data preparation and MLP training process is shown in the image below, and ```the quality score was 0.83```. This number is the accuracy, computed as the ratio of correct answers to the total number of answers. Note that this MLP had only ```one hidden layer of size 100```.

|
||||
|
||||
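A tiny worked example of that accuracy computation (illustrative labels):

```
import numpy as np

y_true = np.array([0, 1, 2, 1, 0])
y_pred = np.array([0, 1, 1, 1, 0])
# accuracy = correct answers / total answers
print((y_true == y_pred).mean())  # 0.8
```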
An MLP with two hidden layers of sizes ```300``` and ```100``` reached an ```accuracy of about 0.92```.

![Alt-текст](2.png)

An MLP with four hidden layers of sizes ```150, 100, 50 and 50``` reached an ```accuracy of 0.95```.

![Alt-текст](3.png)

The MLP with 5 hidden layers of sizes ```100, 400, 600, 400, 100```, i.e. the largest architecture tried, shows the best accuracy of all.

![Alt-текст](4.png)

## Conclusion

Based on the experiments we can conclude that making the neural-network architecture more complex improves its quality.

![Alt-текст](res.png)
86
romanova_adelina_lab_6/lab.py
Normal file
@@ -0,0 +1,86 @@
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
from sklearn.neural_network import MLPClassifier
import argparse

from sklearn.preprocessing import (LabelEncoder,
                                   StandardScaler,
                                   MinMaxScaler,
                                   RobustScaler)
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold, learning_curve, ShuffleSplit


def get_arguments():
    parser = argparse.ArgumentParser()

    parser.add_argument('--id_pred', type=int, default=1, help='Which id from the test set to predict')

    args = parser.parse_args()
    return args


def str_features_to_numeric(data):
    # Converts all string features to numeric ones.

    # Identify the categorical features
    categorical_columns = []
    numerics = ['int8', 'int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    features = data.columns.values.tolist()
    for col in features:
        if data[col].dtype in numerics: continue
        categorical_columns.append(col)

    # Encode the categorical features
    for col in categorical_columns:
        if col in data.columns:
            le = LabelEncoder()
            le.fit(list(data[col].astype(str).values))
            data[col] = le.transform(list(data[col].astype(str).values))

    return data


if __name__ == "__main__":
    args = get_arguments()

    data = pd.read_csv("..//heart_disease_uci.csv")
    data['target'] = data['num']
    data = data.drop(columns=['id', 'dataset', 'num'])

    data_wo_null = data.dropna()
    print(len(data_wo_null))
    data_wo_null.head(3)

    encoded_data_wo_null = str_features_to_numeric(data_wo_null)

    dataset = data_wo_null.copy()  # original data
    target_name = 'target'
    # Pop the target before scaling so it does not end up among the input features
    target = encoded_data_wo_null.pop(target_name)

    scaler = StandardScaler()
    new_data = pd.DataFrame(scaler.fit_transform(encoded_data_wo_null), columns=encoded_data_wo_null.columns)

    X_train, X_test, y_train, y_test = train_test_split(new_data, target, test_size=0.2, random_state=42)

    clf = MLPClassifier(random_state=42, max_iter=300, hidden_layer_sizes=(100)).fit(X_train, y_train)
    print("---"*15, " MLPClassifier(100) ", "---"*15)
    print(f"Accuracy: {clf.score(X_test, y_test)}")

    clf2 = MLPClassifier(random_state=42, max_iter=300, hidden_layer_sizes=(300, 100)).fit(X_train, y_train)
    print("---"*15, " MLPClassifier(300, 100) ", "---"*15)
    print(f"Accuracy: {clf2.score(X_test, y_test)}")

    clf3 = MLPClassifier(random_state=42, max_iter=300, hidden_layer_sizes=(150, 100, 50, 50)).fit(X_train, y_train)
    print("---"*15, " MLPClassifier(150, 100, 50, 50) ", "---"*15)
    print(f"Accuracy: {clf3.score(X_test, y_test)}")

    clf4 = MLPClassifier(random_state=42, max_iter=300, hidden_layer_sizes=(100, 400, 600, 400, 100)).fit(X_train, y_train)
    print("---"*15, " MLPClassifier(100, 400, 600, 400, 100) ", "---"*15)
    print(f"Accuracy: {clf4.score(X_test, y_test)}")

    print("---"*15, f" Predicting the element with id = {args.id_pred} ", "---"*15)
    print(f"Predicted value: {clf3.predict(np.array(list(X_test.iloc[args.id_pred])).reshape(1, -1))}")
    print(f"Actual value: {y_test.iloc[args.id_pred]}")
BIN
romanova_adelina_lab_6/res.png
Normal file
After Width: | Height: | Size: 23 KiB |
52
romanova_adelina_lab_7/README.md
Normal file
@@ -0,0 +1,52 @@
# Lab work No. 7. Variant 21

## Topic

Recurrent neural network and the text-generation task

## Task

- Choose a literary text and train a recurrent neural network on it for the generation task.

- Tune the architecture and parameters to get as close as possible to a meaningful result.

## Resources used

1. A literary text in English, ```wonderland.txt```

2. Python scripts: ```generate.py```, ```model.py```, ```train.py```.

## Description of the work

### Data preparation:
The file ```train.py``` implements the function ```get_data```, which loads the literary text, converts it to lower case, and builds a mapping from characters to numeric values.

The text is split into sequences of fixed length ```seq_length```, and each sequence is paired with the character that follows it.
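A toy illustration of that windowing (hypothetical ```seq_length = 3```):

```
text = "hello"
seq_length = 3
char_to_int = {c: i for i, c in enumerate(sorted(set(text)))}
pairs = [(text[i:i + seq_length], text[i + seq_length])
         for i in range(len(text) - seq_length)]
print(pairs)  # [('hel', 'l'), ('ell', 'o')]
```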
The data are converted to PyTorch tensors and normalized for training the model.

### Model architecture:

The file ```model.py``` defines the class ```CharModel```, which inherits from ```nn.Module``` and implements the recurrent neural network.

The architecture consists of a single LSTM layer with a hidden-state size of 256, a dropout layer for regularization, and a linear layer that produces the output.

### Training the model:

The file ```train.py``` implements the training script. Adam is the chosen optimizer, and the loss function is ```CrossEntropyLoss```.

Training runs on the GPU if one is available. It proceeds for several epochs, with validation at every epoch, and the best model is saved.

The training process:

![alt text](train_process.png)

### Text generation:

In ```generate.py``` the model is loaded from the saved state. A random prompt is drawn from the source text, and the model predicts the next character in a loop.

## Conclusion:

![alt text](generated_text.png)

The generated text contains meaningful passages, so we can conclude that the model trained quite well.
46
romanova_adelina_lab_7/generate.py
Normal file
@@ -0,0 +1,46 @@
import torch
from model import CharModel
import numpy as np


if __name__ == "__main__":
    best_model, char_to_int = torch.load("single-char.pth")
    n_vocab = len(char_to_int)
    int_to_char = dict((i, c) for c, i in char_to_int.items())


    model = CharModel(n_vocab)  # CharModel requires the vocabulary size
    model.load_state_dict(best_model)

    # randomly generate a prompt
    filename = "wonderland.txt"
    seq_length = 100
    raw_text = open(filename, 'r', encoding='utf-8').read()
    raw_text = raw_text.lower()

    start = np.random.randint(0, len(raw_text)-seq_length)
    prompt = raw_text[start:start+seq_length]
    pattern = [char_to_int[c] for c in prompt]

    model.eval()
    print(f'Prompt:\n{prompt}')
    print("==="*15, "Generated result", "==="*15, sep=" ")

    with torch.no_grad():
        for i in range(1000):
            # format input array of int into PyTorch tensor
            x = np.reshape(pattern, (1, len(pattern), 1)) / float(n_vocab)
            x = torch.tensor(x, dtype=torch.float32)
            # generate logits as output from the model
            prediction = model(x)
            # convert logits into one character
            index = int(prediction.argmax())
            result = int_to_char[index]
            print(result, end="")
            # append the new character into the prompt for the next iteration
            pattern.append(index)
            pattern = pattern[1:]

    print()
    print("==="*30)
    print("Done.")
BIN
romanova_adelina_lab_7/generated_text.png
Normal file
After Width: | Height: | Size: 42 KiB |
16
romanova_adelina_lab_7/model.py
Normal file
@@ -0,0 +1,16 @@
import torch.nn as nn


class CharModel(nn.Module):
    def __init__(self, n_vocab):
        super().__init__()
        self.lstm = nn.LSTM(input_size=1, hidden_size=256, num_layers=1, batch_first=True)
        self.dropout = nn.Dropout(0.2)
        self.linear = nn.Linear(256, n_vocab)

    def forward(self, x):
        x, _ = self.lstm(x)
        # take only the last output
        x = x[:, -1, :]
        # produce output
        x = self.linear(self.dropout(x))
        return x
BIN
romanova_adelina_lab_7/single-char.pth
Normal file
86
romanova_adelina_lab_7/train.py
Normal file
@@ -0,0 +1,86 @@
import numpy as np
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data
import torch
from model import CharModel


def get_data(filename="wonderland.txt"):
    # load the dataset and convert it to lower case
    raw_text = open(filename, 'r', encoding='utf-8').read()
    raw_text = raw_text.lower()

    # map each character to its numeric value
    chars = sorted(list(set(raw_text)))
    char_to_int = dict((c, i) for i, c in enumerate(chars))

    # statistics of the training data
    n_chars = len(raw_text)
    n_vocab = len(chars)
    print("Total Characters: ", n_chars)
    print("Total Vocab: ", n_vocab)

    # prepare the dataset
    seq_length = 100
    dataX = []
    dataY = []
    for i in range(0, n_chars - seq_length, 1):
        seq_in = raw_text[i:i + seq_length]
        seq_out = raw_text[i + seq_length]
        dataX.append([char_to_int[char] for char in seq_in])
        dataY.append(char_to_int[seq_out])
    n_patterns = len(dataX)
    print("Total Patterns: ", n_patterns)

    # --- convert the data to tensors so we can work with them inside PyTorch ---
    X = torch.tensor(dataX, dtype=torch.float32).reshape(n_patterns, seq_length, 1)
    X = X / float(n_vocab)
    y = torch.tensor(dataY)
    print(X.shape, y.shape)

    return X, y, char_to_int


def main():
    X, y, char_to_int = get_data()

    n_epochs = 40
    batch_size = 128
    model = CharModel(len(char_to_int))  # CharModel requires the vocabulary size
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    print(f"device: {device}")
    model.to(device)

    optimizer = optim.Adam(model.parameters())
    loss_fn = nn.CrossEntropyLoss(reduction="sum")
    loader = data.DataLoader(data.TensorDataset(X, y), shuffle=True, batch_size=batch_size)

    best_model = None
    best_loss = np.inf

    for epoch in range(n_epochs):
        model.train()
        for X_batch, y_batch in loader:
            y_pred = model(X_batch.to(device))
            loss = loss_fn(y_pred, y_batch.to(device))

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        # Validation
        model.eval()
        loss = 0
        with torch.no_grad():
            for X_batch, y_batch in loader:
                y_pred = model(X_batch.to(device))
                loss += loss_fn(y_pred, y_batch.to(device))
            if loss < best_loss:
                best_loss = loss
                best_model = model.state_dict()
            print("Epoch %d: Cross-entropy: %.4f" % (epoch, loss))

    torch.save([best_model, char_to_int], "single-char.pth")


if __name__ == "__main__":
    main()
BIN
romanova_adelina_lab_7/train_process.png
Normal file
After Width: | Height: | Size: 45 KiB |
3375
romanova_adelina_lab_7/wonderland.txt
Normal file
120
zhukova_alina_lab_7/flask-server.py
Normal file
@@ -0,0 +1,120 @@
from keras.preprocessing.text import Tokenizer

import numpy as np
from flask import Flask
from keras.layers import Dense, LSTM, Embedding
from keras.models import load_model, Sequential
from keras_preprocessing.sequence import pad_sequences

app = Flask(__name__)


@app.route("/")
def home():
    return "<html>" \
           "<h1>Жукова Алина ПИбд-41</h1>" \
           "<h1>Лабораторная работа №7</h1>" \
           "<table>" \
           "<td>" \
           "<form Action='http://127.0.0.1:5000/k4_1_task_7' Method=get>" \
           "<input type=submit value='Генерация текста'>" \
           "</form>" \
           "</td>" \
           "</table>" \
           "</html>"


# Recurrent neural network, text generation
# Variant 10
@app.route("/k4_1_task_7", methods=['GET'])
def k4_1_task_7():
    # Load the text from a file
    # Russian text
    # with open('lab_4_1__7_text_rus.txt', 'r', encoding='utf-8') as file:
    #     text = file.read()
    # English text
    with open('lab_4_1__7_text_eng.txt', 'r', encoding='utf-8') as file:
        text = file.read()

    # Create the Tokenizer and tokenize the text character by character
    tokenizer = Tokenizer(char_level=True)
    tokenizer.fit_on_texts([text])

    # Convert the text into a flat sequence of integer indices
    sequences = tokenizer.texts_to_sequences([text])[0]

    # Prepare the training data: sliding windows of seq_length characters,
    # each labelled with the character that follows the window
    seq_length = 100
    dataX, dataY = [], []
    for i in range(0, len(sequences) - seq_length):
        seq_in = sequences[i:i + seq_length]
        seq_out = sequences[i + seq_length]
        dataX.append(seq_in)
        dataY.append(seq_out)

    dataX = np.array(dataX)
    dataY = np.array(dataY)

    # Build the model
    vocab_size = len(tokenizer.word_index) + 1
    embedding_dim = 256
    rnn_units = 1024

    model = Sequential()
    model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=seq_length))
    model.add(LSTM(units=rnn_units))
    model.add(Dense(units=vocab_size, activation='softmax'))

    model.compile(loss='sparse_categorical_crossentropy', optimizer='adam')

    # Load a previously trained model
    # Russian model
    # model = load_model('lab_4_1__7_model.keras')
    # English model
    model = load_model('lab_4_1__7_model_eng.keras')
    print("Loaded model from disk")

    # Train the model
    # batch_size = 64
    # model.fit(dataX, dataY, epochs=15, batch_size=batch_size)

    def generate_text(seed_text, gen_length):
        generated_text = seed_text

        for _ in range(gen_length):
            # Encode the current window and pad it to the model's input length
            sequence = tokenizer.texts_to_sequences([seed_text])[0]
            sequence = pad_sequences([sequence], maxlen=seq_length)
            # Predict the next character and slide the window forward by one
            prediction = model.predict(sequence)[0]
            predicted_index = np.argmax(prediction)
            predicted_char = tokenizer.index_word[predicted_index]
            generated_text += predicted_char
            seed_text += predicted_char
            seed_text = seed_text[1:]

        return generated_text

    # Example usage
    start_phrase = "Black cat"
    # Russian
    # generated_text = generate_text("Невероятный котик", 250)
    # English
    generated_text = generate_text(start_phrase, 250)

    # Save the model
    # Russian model
    # model.save('C:/Users/Alina/PycharmProjects/lab1/lab_4_1__7_model.keras')
    # English model
    # model.save('C:/Users/Alina/PycharmProjects/lab1/lab_4_1__7_model_eng.keras')
    # print("Saved model to disk")

    return "<html>" \
           "<h1></h1>" \
           "<h2>Вариант 10. Задание 7 - Генерация текста</h2>" \
           "<h2> Сгенерированный текст, начальная фраза " + start_phrase + ": " + str(generated_text) + " </h2>" \
           "</html>"


if __name__ == "__main__":
    app.run(debug=True)
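`generate_text` above always takes `np.argmax`, so generation is deterministic and tends to fall into loops (a behaviour the readme below also reports). A common remedy, not present in the original code, is temperature sampling; a minimal sketch reusing the `tokenizer`, `model`, and `seq_length` defined in flask-server.py:

```python
import numpy as np
from keras_preprocessing.sequence import pad_sequences

def sample_next_char(seed_text, temperature=0.8):
    # Encode the current window exactly as generate_text does.
    sequence = tokenizer.texts_to_sequences([seed_text])[0]
    sequence = pad_sequences([sequence], maxlen=seq_length)
    probs = model.predict(sequence)[0]

    # Reweight by temperature and sample instead of taking the argmax:
    # temperature < 1 sharpens the distribution, > 1 flattens it.
    logits = np.log(probs + 1e-9) / temperature
    probs = np.exp(logits) / np.sum(np.exp(logits))
    predicted_index = np.random.choice(len(probs), p=probs)

    # Index 0 is the reserved padding slot and has no character.
    return tokenizer.index_word.get(predicted_index, "")
```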
BIN
zhukova_alina_lab_7/img_screen_1.png
Normal file
After: Size 14 KiB
BIN
zhukova_alina_lab_7/img_screen_2.png
Normal file
After: Size 19 KiB
BIN
zhukova_alina_lab_7/img_screen_3.png
Normal file
After: Size 20 KiB
BIN
zhukova_alina_lab_7/img_screen_4.png
Normal file
After: Size 16 KiB
BIN
zhukova_alina_lab_7/img_screen_5.png
Normal file
After: Size 14 KiB
1
zhukova_alina_lab_7/lab_4_1__7_text_eng.txt
Normal file
@@ -0,0 +1 @@
Cats are wonderful pets that many people love. They come in different colors and sizes. Cats are known for being playful and sometimes a bit lazy. These furry friends have soft paws and sharp claws. They use their claws for various things, like scratching to keep their claws healthy and marking their territory. Cats also have a special ability to land on their feet if they fall, which is really impressive. Cats enjoy their independence. They often like to explore their surroundings and might hide in cozy spots. They clean themselves by licking their fur and are usually very clean animals. Cats like to communicate with us using different sounds, like meowing, purring, and even hissing if they’re scared. They also use their tails to show how they feel. A wagging tail might mean they’re excited, while a puffed-up tail can mean they’re scared. Playing with cats using toys like balls or strings is lots of fun, and it keeps them active. They also like to nap a lot during the day. If you’re thinking of having a cat as a pet, remember to give them love, care, and a cozy place to sleep. In short, cats are lovely pets with soft fur and sharp claws. They’re independent, playful, and great at keeping clean. Cats talk to us with sounds and tails, and they enjoy playing and napping. If you have a cat, make sure to give them care and affection. I have a cat. Her name is Matilda. She is seven years old. She is grey with a few dark spots. Matilda has green eyes. She is quite fluffy and big. When we brought Matilda home, she was a little two-month-old kitten. At first, she was scared to leave a little blanket that my mom laid for her in the kitchen. A few days later, she started to explore the surroundings. Matilda is a very smart pet. She is a bit shy and always hides somewhere when we have guests. She is very independent and likes solitude. When Matilda was a little kitten she loved to play with different toys, balls and strings. Now most of the time she spends lying on the sofa or the armchair. We buy her cat food in the shop because this is all she eats. Apart from that she is also fond of fresh cucumbers. There are two dishes for her in the kitchen. One is filled with food, and the other contains water. My cat eats three times a day and likes drinking directly from the tap. Matilda rarely shows affection. Sometimes it seems that my mom is her most loved person. She likes to climb my mom’s lap and stay there for some time. Matilda purrs when my mom strokes her. It is hard to make Matilda sit on somebody else’s lap. Nevertheless, I love Matilda very much! She is an amazing cat with immaculate manners.
1
zhukova_alina_lab_7/lab_4_1__7_text_rus.txt
Normal file
80
zhukova_alina_lab_7/readme.md
Normal file
@@ -0,0 +1,80 @@
## Task

Recurrent neural network and the text generation problem.

Choose texts in Russian and in English.

Train a neural network and tune its parameters so as
to get as close as possible to a meaningful result.

Interpret the results.

Variant No. 10
## Technologies used

The lab uses the following libraries:

+ keras - used for the recurrent neural network and the machine learning methods
+ numpy - provides work with arrays and matrices
+ Flask - a quick way to build web pages that visualize the application's output
## Components used

+ Tokenizer - a text tokenization tool that counts token occurrence frequencies (see the round-trip sketch after this list)
+ Sequential - provides a linear stack of neural network layers
+ Embedding - a keras layer that turns integer sequences into dense vectors
+ LSTM - a special recurrent network architecture capable of learning long-term dependencies
+ Dense - provides the dense (fully connected) layers of the network
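For reference, a minimal round trip with the char-level Tokenizer (assuming the Keras 2 preprocessing API that flask-server.py uses):

```python
from keras.preprocessing.text import Tokenizer

# Fit a char-level tokenizer on a toy corpus.
tokenizer = Tokenizer(char_level=True)
tokenizer.fit_on_texts(["black cat"])

# Encode: every character maps to a 1-based integer index;
# index 0 is reserved, hence vocab_size = len(word_index) + 1 in the lab code.
encoded = tokenizer.texts_to_sequences(["cat"])[0]

# Decode: index_word inverts the mapping character by character.
decoded = "".join(tokenizer.index_word[i] for i in encoded)
print(encoded, decoded)  # indices depend on character frequency in the corpus
```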
## How to run

Run the flask-server file; it starts a local server
and makes the program reachable from a browser at [http://127.0.0.1:5000/](http://127.0.0.1:5000/)
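For a quick check without a browser, the endpoint can also be called from Python (assuming the `requests` package is installed; the URL is the route registered in flask-server.py):

```python
import requests

# Ask the running server to generate text.
resp = requests.get("http://127.0.0.1:5000/k4_1_task_7")
print(resp.status_code)  # 200 once the model is loaded and generation succeeds
print(resp.text[:200])   # start of the returned HTML containing the generated text
```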
## What the program does

Depending on the parameters in the code, it trains on and generates either Russian or English text.
First the text read from the file is tokenized, and the neural network model is built.
The resulting recurrent network is then trained, after which it generates 250 characters
starting from a seed phrase.
## Analysis

The model produces its best results with 15 training iterations.
With too few iterations, the model generates not words but one and the same combination of letters,
separating the combinations with spaces.
At 15 iterations on a text of about 3,400 characters, training took around 5-6 hours.
For that reason the program implements saving and loading of models, so that the network does not have
to be retrained from scratch but can be fine-tuned on the same or new data, as sketched below.
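A minimal sketch of that save/load/fine-tune cycle with the same Keras calls as flask-server.py (the 15-epoch, batch-size-64 settings come from the commented-out training code; the extra epoch count is illustrative):

```python
from keras.models import load_model

# Initial training, then checkpoint the model to disk.
model.fit(dataX, dataY, epochs=15, batch_size=64)
model.save('lab_4_1__7_model_eng.keras')

# Later: restore and continue training without starting from scratch.
model = load_model('lab_4_1__7_model_eng.keras')  # architecture, weights and optimizer state
model.fit(dataX, dataY, epochs=5, batch_size=64)  # fine-tune on the same or new data
```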
With the chosen network architecture, the model's quality dropped significantly when new data was introduced.
When working on the same data, 15 iterations are enough for the network to generate
words from the text. Meaningful text, however, could not be achieved. This is evidently due to
the chosen architecture, the shortage of data, and insufficient training. The conclusion is that
training the network to generate meaningful text on the available hardware, if possible at all,
would be very costly.
## Program screenshots

**Best result for Russian text generation.**

Russian text generated with sufficient training. Almost all generated words are real ones,
but the resulting text still cannot be called meaningful.



**Other cases**

Russian text generated with insufficient training. Instead of words, the model produces letter
combinations that very quickly start to loop.



Russian text generated by an overfitted model. The network does not generate new text; it
reproduces a fragment of the training data.



The same happens with English text.

**Best result for English text generation.**

English text generated with sufficient training. Almost all generated words are real ones,
but the resulting text still cannot be called meaningful.



When generating English text, the looping sets in sooner, possibly
because fewer distinct characters are involved.

