MII_Labs_Mochalov_PI-33/lab13/main.py

48 lines
1.6 KiB
Python
Raw Permalink Normal View History

2024-05-11 05:36:26 +04:00
import math
import time
import pymystem3
from collections import defaultdict
def filter_stop_words(text):
    """Analyse *text* with Mystem and drop tokens tagged as stop-word classes.

    Args:
        text: Raw text to run through ``pymystem3.Mystem().analyze``.

    Returns:
        A list of the token dicts returned by Mystem that have a non-empty
        ``'analysis'`` list and whose first reading is not tagged ``PR``,
        ``INTJ``, ``NUM`` or ``PART``.
    """
    stop_tags = {'PR', 'INTJ', 'NUM', 'PART'}
    m = pymystem3.Mystem()
    filtered_words = []
    for word in m.analyze(text):
        readings = word.get('analysis')
        if not readings:
            # No morphological reading for this token: nothing to classify.
            continue
        # 'gr' is a composite grammeme string (e.g. "PR=" or "S,жен,од=им,ед"),
        # so comparing the whole string with a bare tag never matches. The
        # part-of-speech tag is the leading field, ended by ',' or '='.
        pos = readings[0]['gr'].partition('=')[0].partition(',')[0]
        if pos not in stop_tags:
            filtered_words.append(word)
    return filtered_words
def count_feminine_nouns(analysis):
    """Count tokens whose first reading is a feminine noun.

    Args:
        analysis: Iterable of Mystem token dicts; each may carry an
            ``'analysis'`` list whose items have a ``'gr'`` grammeme string.

    Returns:
        The number of tokens whose first reading has the part-of-speech tag
        ``S`` and the grammeme ``жен``. Tokens without any reading are skipped.
    """
    feminine_nouns = 0
    for word in analysis:
        readings = word.get('analysis')
        if not readings:
            # Missing or empty 'analysis': indexing [0] would raise.
            continue
        gr = readings[0]['gr']
        # Compare the leading tag exactly: a substring test for 'S' would also
        # accept other tags that merely contain the letter, such as 'SPRO'.
        pos = gr.partition('=')[0].partition(',')[0]
        if pos == 'S' and 'жен' in gr:
            feminine_nouns += 1
    return feminine_nouns
def find_significant_bigrams(analysis):
    """Rank adjacent lemma pairs by a frequency-weighted mutual-information score.

    For every pair of neighbouring lemmas the score is
    ``count * log2(count / expected)``, where ``expected`` is
    ``count(first) * count(second) / total_lemmas``.

    Args:
        analysis: Sequence of Mystem token dicts. The lemma is read from
            ``word['analysis'][0]['lex']``; tokens without a reading are
            skipped.

    Returns:
        A list of ``((lemma1, lemma2), score)`` tuples sorted by score in
        descending order. Empty when fewer than two lemmas are available.
    """
    lemmas = [
        word['analysis'][0]['lex']
        for word in analysis
        if word.get('analysis')
    ]
    total = len(lemmas)

    # Tally each lemma once up front instead of rescanning the whole token
    # list for every distinct bigram.
    unigram_counts = defaultdict(int)
    for lemma in lemmas:
        unigram_counts[lemma] += 1

    bigrams = defaultdict(int)
    for pair in zip(lemmas, lemmas[1:]):
        bigrams[pair] += 1

    significant_bigrams = []
    for bigram, count in bigrams.items():
        first_count = unigram_counts[bigram[0]]
        second_count = unigram_counts[bigram[1]]
        expected_count = (first_count * second_count) / total
        mi = count * math.log(count / expected_count, 2)
        significant_bigrams.append((bigram, mi))
    return sorted(significant_bigrams, key=lambda x: x[1], reverse=True)
if __name__ == '__main__':
    # Script entry point: read input.txt, time the Mystem filtering step, then
    # print the feminine-noun count and the ranked bigrams.
    # The context manager closes the file handle once the text is read.
    with open('input.txt', encoding='utf8') as source:
        text = source.read()

    # perf_counter is monotonic, so the elapsed time is not affected by
    # system clock adjustments.
    start = time.perf_counter()
    analysis = filter_stop_words(text)
    print(f"{time.perf_counter() - start} sec.")
    print(count_feminine_nouns(analysis))
    print(find_significant_bigrams(analysis))