Лаба 13
This commit is contained in:
parent
b0e78b3f6d
commit
3e7b17abf0
47
lab13/main.py
Normal file
47
lab13/main.py
Normal file
@ -0,0 +1,47 @@
|
||||
import math
|
||||
import time
|
||||
|
||||
import pymystem3
|
||||
from collections import defaultdict
|
||||
|
||||
|
||||
def filter_stop_words(text):
    """Analyse *text* with Mystem and drop stop-word tokens.

    Args:
        text: Raw text to analyse.

    Returns:
        A list of the Mystem token dicts that carry at least one analysis
        variant and whose first variant's part of speech is not a
        preposition (``PR``), interjection (``INTJ``), numeral (``NUM``)
        or particle (``PART``).
    """
    stop_pos = {'PR', 'INTJ', 'NUM', 'PART'}
    tokens = pymystem3.Mystem().analyze(text)

    def part_of_speech(token):
        # Mystem grammeme strings start with the part-of-speech tag followed
        # by ',' or '=' (e.g. 'PR=' or 'S,жен,неод=им,ед'). Comparing the
        # whole string against bare tags never matches, so isolate the
        # leading tag first.
        gr = token['analysis'][0]['gr']
        return gr.split('=')[0].split(',')[0]

    return [
        token for token in tokens
        # Tokens without variants (spaces, punctuation, unknown words) are dropped.
        if token.get('analysis') and part_of_speech(token) not in stop_pos
    ]
|
||||
|
||||
|
||||
def count_feminine_nouns(analysis):
    """Count tokens whose first analysis variant is a feminine noun.

    Args:
        analysis: Iterable of Mystem-style token dicts. Tokens without an
            ``'analysis'`` key, or with an empty variant list, are skipped.

    Returns:
        The number of tokens whose first variant has part of speech ``S``
        and carries the ``жен`` (feminine) grammeme.
    """
    feminine_nouns = 0
    for word in analysis:
        variants = word.get('analysis')
        if not variants:
            # No morphological variants: nothing to inspect (and indexing
            # [0] would raise on an empty list).
            continue
        gr = variants[0]['gr']
        # Normalise every grammeme separator to ',' so tags can be compared
        # exactly; a substring test such as `'S' in gr` would also match
        # the pronoun tag 'SPRO'.
        for separator in '=()|':
            gr = gr.replace(separator, ',')
        tags = gr.split(',')
        if tags[0] == 'S' and 'жен' in tags:
            feminine_nouns += 1
    return feminine_nouns
|
||||
|
||||
|
||||
def find_significant_bigrams(analysis):
    """Rank adjacent lemma pairs by a mutual-information-style score.

    For every pair of neighbouring tokens the lemma (``'lex'`` of the first
    analysis variant) bigram is counted. Each distinct bigram is scored as
    ``count * log2(count / expected)``, where
    ``expected = freq(word1) * freq(word2) / len(analysis)``.

    Args:
        analysis: Sequence of Mystem-style token dicts; every token must
            have a non-empty ``'analysis'`` list.

    Returns:
        A list of ``((word1, word2), score)`` tuples sorted by descending
        score. Empty when fewer than two tokens are given.

    Raises:
        KeyError, IndexError: If a token lacks analysis variants.
    """
    total = len(analysis)
    if total < 2:
        # No adjacent pairs exist, so there is nothing to score.
        return []

    lexemes = [token['analysis'][0]['lex'] for token in analysis]

    # Count every lemma once up front instead of rescanning the whole token
    # list for each distinct bigram.
    unigram_counts = defaultdict(int)
    for lex in lexemes:
        unigram_counts[lex] += 1

    bigram_counts = defaultdict(int)
    for pair in zip(lexemes, lexemes[1:]):
        bigram_counts[pair] += 1

    scored = []
    for (first, second), count in bigram_counts.items():
        expected_count = (unigram_counts[first] * unigram_counts[second]) / total
        mi = count * math.log(count / expected_count, 2)
        scored.append(((first, second), mi))
    return sorted(scored, key=lambda item: item[1], reverse=True)
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Read the whole input up front; the context manager guarantees the file
    # handle is closed even if reading fails.
    with open('input.txt', encoding='utf8') as source:
        text = source.read()

    # Time only the analysis / stop-word filtering step, using a monotonic
    # clock so the measurement is unaffected by system clock adjustments.
    start = time.perf_counter()
    analysis = filter_stop_words(text)
    print(f"{time.perf_counter() - start} sec.")

    print(count_feminine_nouns(analysis))
    print(find_significant_bigrams(analysis))
|
Loading…
Reference in New Issue
Block a user