48 lines
1.6 KiB
Python
48 lines
1.6 KiB
Python
|
import math
|
||
|
import time
|
||
|
|
||
|
import pymystem3
|
||
|
from collections import defaultdict
|
||
|
|
||
|
|
||
|
def filter_stop_words(text, mystem=None):
    """Analyse *text* with Mystem and drop tokens that are stop words.

    A token is kept only when Mystem produced at least one reading for it
    and the part of speech of the first reading is not a preposition
    (``PR``), interjection (``INTJ``), numeral (``NUM``) or particle
    (``PART``). Tokens without an ``analysis`` entry, or with an empty
    one, are dropped as well.

    Args:
        text: Raw text to analyse.
        mystem: Optional analyzer object exposing ``analyze(text)``.
            When omitted, a new ``pymystem3.Mystem()`` is created for
            this call.

    Returns:
        A list of the token dicts returned by ``analyze`` that passed
        the filter, in their original order.
    """
    if mystem is None:
        mystem = pymystem3.Mystem()

    stop_pos = {'PR', 'INTJ', 'NUM', 'PART'}
    filtered_words = []
    for word in mystem.analyze(text):
        readings = word.get('analysis')
        if not readings:
            continue
        # 'gr' holds the full grammar string (e.g. 'PR=' or
        # 'S,жен,неод=им,ед'), so comparing it to a bare tag never
        # matches. The part of speech is the first tag, terminated by
        # ',' or '='.
        gr = readings[0]['gr']
        pos = gr.split('=', 1)[0].split(',', 1)[0]
        if pos not in stop_pos:
            filtered_words.append(word)

    return filtered_words
|
||
|
|
||
|
|
||
|
def count_feminine_nouns(analysis):
    """Count tokens whose first reading is a feminine noun.

    Args:
        analysis: Iterable of token dicts. A token is inspected only if
            it has a non-empty ``'analysis'`` list; the ``'gr'`` string
            of the first reading is used.

    Returns:
        The number of tokens whose part of speech is exactly ``S`` and
        whose grammar string contains the ``жен`` tag.
    """
    feminine_nouns = 0
    for word in analysis:
        readings = word.get('analysis')
        if not readings:
            # No reading at all (or an empty list): nothing to inspect.
            continue
        gr = readings[0]['gr']
        # The part of speech is the first tag of the grammar string. A
        # plain substring test for 'S' would also match other tags such
        # as 'SPRO', so compare the extracted tag exactly.
        pos = gr.split('=', 1)[0].split(',', 1)[0]
        if pos == 'S' and 'жен' in gr:
            feminine_nouns += 1
    return feminine_nouns
|
||
|
|
||
|
|
||
|
def find_significant_bigrams(analysis):
    """Score adjacent lemma pairs by a count-weighted mutual information.

    For every pair of neighbouring tokens the lemma (``'lex'``) of the
    first reading is taken. Each distinct bigram is scored as
    ``count * log2(count / expected)``, where ``expected`` is
    ``count(word1) * count(word2) / len(analysis)``.

    Args:
        analysis: Sequence of token dicts; every token must have a
            non-empty ``'analysis'`` list whose first reading has a
            ``'lex'`` key.

    Returns:
        A list of ``((word1, word2), score)`` tuples sorted by score in
        descending order. Ties keep the order of first occurrence.
        Fewer than two tokens yield an empty list.
    """
    if len(analysis) < 2:
        return []

    lexemes = [word['analysis'][0]['lex'] for word in analysis]
    total = len(lexemes)

    # Count every lemma once up front instead of rescanning the whole
    # token list for each distinct bigram.
    unigram_counts = defaultdict(int)
    for lex in lexemes:
        unigram_counts[lex] += 1

    bigrams = defaultdict(int)
    for pair in zip(lexemes, lexemes[1:]):
        bigrams[pair] += 1

    significant_bigrams = []
    for bigram, count in bigrams.items():
        first, second = bigram
        expected_count = (unigram_counts[first] * unigram_counts[second]) / total
        mi = count * math.log2(count / expected_count)
        significant_bigrams.append((bigram, mi))

    return sorted(significant_bigrams, key=lambda x: x[1], reverse=True)
|
||
|
|
||
|
|
||
|
if __name__ == '__main__':
    # Read the whole input up front; the context manager closes the
    # file handle, which the bare open(...).read() never did.
    with open('input.txt', encoding='utf8') as source:
        text = source.read()

    # perf_counter is a monotonic clock meant for measuring elapsed time.
    start = time.perf_counter()
    analysis = filter_stop_words(text)
    print(f"{time.perf_counter() - start} sec.")

    print(count_feminine_nouns(analysis))
    print(find_significant_bigrams(analysis))
|