Skip to content

Commit 63eaaaa

Browse files
Merge pull request #406 from OSLL/new_criteria_compare_whole_texts
392 Compare student speech whole text
2 parents 02a5203 + a1706b0 commit 63eaaaa

File tree

13 files changed

+308
-17
lines changed

13 files changed

+308
-17
lines changed
Lines changed: 90 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,90 @@
1+
from bson import ObjectId
2+
3+
from app.root_logger import get_root_logger
4+
from app.localisation import *
5+
from ..criterion_base import BaseCriterion
6+
from ..criterion_result import CriterionResult
7+
from app.audio import Audio
8+
from app.presentation import Presentation
9+
from app.utils import normalize_text, delete_punctuation
10+
from ..text_comparison import SlidesSimilarityEvaluator
11+
12+
logger = get_root_logger('web')
13+
14+
15+
# Criterion that scores how closely the text of each slide matches what the
# student says while that slide is shown.
class ComparisonSpeechSlidesCriterion(BaseCriterion):
    """Compare per-slide speech with per-slide text using ``SlidesSimilarityEvaluator``.

    Parameters read from ``self.parameters``:
        skip_slides: list of phrases; a slide whose punctuation-stripped text
            contains any of them (case-insensitively) is excluded.
        slide_speech_threshold: mean similarity that yields the full score;
            defaults to 0.125 when not supplied.
    """

    PARAMETERS = dict(
        skip_slides=list.__name__,
    )

    def __init__(self, parameters, dependent_criteria, name=''):
        super().__init__(
            name=name,
            parameters=parameters,
            dependent_criteria=dependent_criteria,
        )
        self.evaluator = SlidesSimilarityEvaluator()
        # The threshold is optional in the configuration; fall back to the default.
        if 'slide_speech_threshold' not in self.parameters:
            self.parameters['slide_speech_threshold'] = 0.125

    @property
    def description(self):
        """Return the localised, human-readable description of the criterion."""
        return {
            "Критерий": t(self.name),
            "Описание": t(
                "Проверяет, что текст слайда соответствует словам, которые произносит студент во время демонстрации "
                "этого слайда"),
            "Оценка": t("1, если среднее значение соответствия речи содержимому слайдов равно или превосходит заданного порога (от 0 до 1), "
                        "иначе r / значение порога, где r - среднее значение соответствия речи демонстрируемым слайдам")
        }

    def skip_slide(self, current_slide_text: str) -> bool:
        """Return True when the slide text contains any of the ``skip_slides`` phrases."""
        # Strip punctuation and lowercase once instead of once per phrase.
        cleaned_text = delete_punctuation(current_slide_text).lower()
        return any(phrase.lower() in cleaned_text for phrase in self.parameters['skip_slides'])

    def apply(self, audio: Audio, presentation: Presentation, training_id: ObjectId,
              criteria_results: dict) -> CriterionResult:
        """Score the mean speech/slide similarity relative to the threshold.

        Slides with no recognized speech count as 0 similarity; slides matched
        by ``skip_slide`` are left out of the mean entirely.

        Returns:
            ``CriterionResult`` with score 1 when the mean similarity reaches
            the threshold, otherwise ``mean / threshold`` together with the
            list of slides that fell below the threshold. When there is
            nothing to evaluate at all the score is 0.
        """
        threshold = self.parameters['slide_speech_threshold']
        # Similarity per 1-based slide number.
        results = {}
        # (normalized speech, normalized slide text, slide number) triples.
        slides_to_process = []

        for slide_index, audio_slide in enumerate(audio.audio_slides):
            slide_number = slide_index + 1
            # Keep only the words; timestamps and probabilities are not needed.
            spoken_words = [recognized.word.value for recognized in audio_slide.recognized_words]
            speech = " ".join(normalize_text(spoken_words))

            # Nothing was said on this slide: it scores zero and is not compared.
            if not speech.split():
                results[slide_number] = 0.000
                continue

            slide_text = presentation.slides[slide_index].words
            # Slides from the skip list do not take part in the evaluation.
            if self.skip_slide(slide_text):
                logger.info(f"Слайд №{slide_number} пропущен")
                continue

            slide_text = " ".join(normalize_text(slide_text.split()))
            slides_to_process.append((speech, slide_text, slide_number))

        # Fitting TF-IDF on an empty corpus raises ValueError (empty vocabulary),
        # so the model is trained only when there is something to compare.
        if slides_to_process:
            self.evaluator.train_model([
                " ".join(speech for speech, _, _ in slides_to_process),
                " ".join(slide_text for _, slide_text, _ in slides_to_process),
            ])
            for speech, slide_text, slide_number in slides_to_process:
                results[slide_number] = self.evaluator.evaluate_semantic_similarity(speech, slide_text)

        # No audio slides, or every spoken slide was skipped: avoid dividing by zero.
        # NOTE(review): score 0 mirrors the "nothing to compare" handling of the
        # whole-speech criterion -- confirm this is the desired grade.
        if not results:
            return CriterionResult(0, "Нет слайдов, для которых можно сравнить речь с текстом презентации")

        results = dict(sorted(results.items()))
        score = (sum(results.values()) / len(results)) / threshold

        if score >= 1:
            return CriterionResult(1, "Отлично")

        weak_slides = ",\n".join(
            f"№{number} - {similarity}"
            for number, similarity in results.items()
            if similarity < threshold
        )
        return CriterionResult(score, "Следует уделить внимание "
                                      "соотвествию речи на слайдах "
                                      "{}".format(weak_slides))
Lines changed: 84 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,84 @@
1+
from bson import ObjectId
2+
3+
from app.root_logger import get_root_logger
4+
from app.localisation import *
5+
from ..criterion_base import BaseCriterion
6+
from ..criterion_result import CriterionResult
7+
from app.audio import Audio
8+
from app.presentation import Presentation
9+
from app.utils import normalize_text
10+
from ..text_comparison import Doc2VecEvaluator
11+
12+
logger = get_root_logger('web')
13+
14+
15+
class ComparisonWholeSpeechCriterion(BaseCriterion):
    """Compare the whole speech with the whole presentation text via ``Doc2VecEvaluator``.

    All entries of ``PARAMETERS`` are read from ``self.parameters`` and passed
    positionally to ``Doc2VecEvaluator``.
    """

    PARAMETERS = dict(
        vector_size=int.__name__,
        window=int.__name__,
        min_count=int.__name__,
        workers=int.__name__,
        epochs=int.__name__,
        dm=int.__name__,
    )

    def __init__(self, parameters, dependent_criteria, name=''):
        super().__init__(
            name=name,
            parameters=parameters,
            dependent_criteria=dependent_criteria,
        )
        self.model = Doc2VecEvaluator(
            self.parameters['vector_size'],
            self.parameters['window'],
            self.parameters['min_count'],
            self.parameters['workers'],
            self.parameters['epochs'],
            self.parameters['dm'],
        )

    @property
    def description(self):
        """Return the localised, human-readable description of the criterion."""
        return {
            "Критерий": t(self.name),
            "Описание": t("Проверяет, что тема доклада студента совпадает с темой презентации"),
            "Оценка": t(
                "1, если тема доклада и презентации совпадают не менее, чем на 40%, иначе 2.5 * k, где k - степень соответствия темы доклада теме презентации")
        }

    def apply(self, audio: Audio, presentation: Presentation, training_id: ObjectId,
              criteria_results: dict) -> CriterionResult:
        """Score the similarity between everything said and everything shown.

        Returns:
            ``CriterionResult`` with score 0 when no speech was recognized or
            the presentation has no text; otherwise ``2.5 * similarity``
            limited to the range [0, 1].
        """
        normalized_speech = []
        normalized_slides = []

        for slide_index, audio_slide in enumerate(audio.audio_slides):
            # Keep only the words; timestamps and probabilities are not needed.
            spoken_words = [recognized.word.value for recognized in audio_slide.recognized_words]
            speech = " ".join(normalize_text(spoken_words))
            if speech != "":
                normalized_speech.append(speech)

            # Text of the slide shown while the words above were spoken.
            slide_text = " ".join(normalize_text(presentation.slides[slide_index].words.split()))
            if slide_text != "":
                normalized_slides.append(slide_text)

        if not normalized_speech:
            return CriterionResult(0, "Тренажер не зафиксировал, что вы что-то говорили")
        if not normalized_slides:
            return CriterionResult(0, "Загруженная вами презентация не содержит текста")

        normalized_speech_text = " ".join(normalized_speech)
        normalized_slides_text = " ".join(normalized_slides)

        self.model.train_model([normalized_speech_text, normalized_slides_text])
        similarity = self.model.evaluate_semantic_similarity(normalized_speech_text, normalized_slides_text)

        # 2.5 == 1 / 0.4, so a 40% similarity already yields the full score.
        # Cosine similarity may be negative; a criterion score must not be.
        score = max(0.0, 2.5 * similarity)
        logger.info(f"Score={score}")

        if score >= 1:
            return CriterionResult(1, "Ваша речь соответствует тексту презентации")
        return CriterionResult(score, "Ваша речь не полностью соответствует теме презентации")

app/criteria/criterions.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,3 +8,5 @@
88
from .speech_is_not_in_database.criterion import SpeechIsNotInDatabaseCriterion
99
from .speech_pace.criterion import SpeechPaceCriterion
1010
from .strict_speech_duration.criterion import StrictSpeechDurationCriterion
11+
from .comparison_speech_slides.criterion import ComparisonSpeechSlidesCriterion
12+
from .comparison_whole_speech.criterion import ComparisonWholeSpeechCriterion

app/criteria/preconfigured_criterions.py

Lines changed: 23 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,10 +9,11 @@
99

1010
from criteria import (FillersNumberCriterion, FillersRatioCriterion,
1111
SpeechIsNotInDatabaseCriterion, SpeechPaceCriterion,
12-
StrictSpeechDurationCriterion)
12+
StrictSpeechDurationCriterion, ComparisonSpeechSlidesCriterion,
13+
ComparisonWholeSpeechCriterion)
1314

1415
from .utils import DEFAULT_FILLERS
15-
16+
from .utils import DEFAULT_SKIP_SLIDES
1617

1718
preconfigured_criterions = [
1819
# SpeechDurationCriterion
@@ -143,7 +144,27 @@
143144
}
144145
},
145146
dependent_criteria=[],
147+
),
148+
149+
ComparisonSpeechSlidesCriterion(
150+
name="ComparisonSpeechSlidesCriterion",
151+
parameters={"skip_slides": DEFAULT_SKIP_SLIDES},
152+
dependent_criteria=[],
153+
),
154+
155+
ComparisonWholeSpeechCriterion(
156+
name="ComparisonWholeSpeechCriterion",
157+
parameters={
158+
"vector_size": 200,
159+
"window": 5,
160+
"min_count": 3,
161+
"workers": 4,
162+
"epochs": 40,
163+
"dm": 0
164+
},
165+
dependent_criteria=[],
146166
)
167+
147168
]
148169

149170

app/criteria/text_comparison.py

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
from sklearn.feature_extraction.text import TfidfVectorizer
2+
from sklearn.metrics.pairwise import cosine_similarity
3+
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
4+
5+
6+
class SlidesSimilarityEvaluator:
    """Cosine similarity of two texts in a TF-IDF space fitted by ``train_model``."""

    def __init__(self):
        # Unigram features only.
        self.vectorizer = TfidfVectorizer(ngram_range=(1, 1))

    def train_model(self, corpus: list):
        """Fit the TF-IDF vectorizer on ``corpus`` (a list of documents)."""
        self.vectorizer.fit(corpus)

    def evaluate_semantic_similarity(self, text1: str, text2: str) -> float:
        """Return the cosine similarity of the two texts, rounded to 3 decimals."""
        first_vector, second_vector = (
            self.vectorizer.transform([text]) for text in (text1, text2)
        )
        # One row against one row produces a 1x1 matrix; take its only cell.
        similarity_matrix = cosine_similarity(first_vector, second_vector)
        return round(similarity_matrix[0][0], 3)
19+
20+
21+
class Doc2VecEvaluator:
    """Similarity of two texts based on word vectors of a ``Doc2Vec`` model."""

    def __init__(self, vector_size: int, window: int, min_count: int, workers: int, epochs: int, dm: int):
        # All arguments are forwarded unchanged to gensim's Doc2Vec constructor.
        self.model = Doc2Vec(vector_size=vector_size, window=window, min_count=min_count, workers=workers,
                             epochs=epochs, dm=dm)

    def train_model(self, documents: list):
        """Build the vocabulary from ``documents`` and train the model on them.

        Each document is a whitespace-separated string; its position in the
        list is used as the document tag.
        """
        tagged_documents = [TaggedDocument(words=doc.split(), tags=[i]) for i, doc in enumerate(documents)]
        self.model.build_vocab(tagged_documents)
        self.model.train(tagged_documents, total_examples=self.model.corpus_count, epochs=self.model.epochs)

    def evaluate_semantic_similarity(self, text1: str, text2: str) -> float:
        """Return the similarity of the two texts, rounded to 3 decimals.

        Only tokens present in the model vocabulary take part: words dropped
        by ``min_count`` would otherwise make ``n_similarity`` raise
        ``KeyError``. When either text has no known token left the
        similarity is 0.0.
        """
        word_vectors = self.model.wv
        known_tokens1 = [token for token in text1.split() if token in word_vectors]
        known_tokens2 = [token for token in text2.split() if token in word_vectors]

        # n_similarity cannot handle an empty token list.
        if not known_tokens1 or not known_tokens2:
            return 0.0

        similarity = word_vectors.n_similarity(known_tokens1, known_tokens2)

        # Cast to a built-in float so callers do not receive a numpy scalar.
        return round(float(similarity), 3)

app/criteria/utils.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,6 @@
33
import traceback
44
from typing import Optional, Callable
55

6-
76
from app.audio import Audio
87
from app.utils import get_types
98

@@ -84,6 +83,10 @@ def get_fillers_number(fillers: list, audio: Audio) -> int:
8483
return sum(map(len, get_fillers(fillers, audio)))
8584

8685

86+
DEFAULT_SKIP_SLIDES = [
87+
"Спасибо за внимание",
88+
]
89+
8790
DEFAULT_FILLERS = [
8891
'короче',
8992
'однако',

app/criteria_pack/preconfigured_pack.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,10 @@
3232
['DEFAULT_FILLERS_RATIO_CRITERION', 0.33]],
3333
'SlidesCheckerPack':
3434
[['SimpleNumberSlidesCriterion', 0.05],
35-
['SlidesCheckerCriterion', 0.95]]
35+
['SlidesCheckerCriterion', 0.95]],
36+
'ComparisonPack':
37+
[['ComparisonSpeechSlidesCriterion', 0.5],
38+
['ComparisonWholeSpeechCriterion', 0.5]]
3639
}
3740

3841

app/feedback_evaluator.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
import json
22

33
from app.criteria import SpeechDurationCriterion, SpeechPaceCriterion, FillersRatioCriterion, FillersNumberCriterion, \
4-
StrictSpeechDurationCriterion
4+
StrictSpeechDurationCriterion, ComparisonSpeechSlidesCriterion, ComparisonWholeSpeechCriterion
55

66

77
class Feedback:

app/presentation_parser/slide_splitter.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,5 @@
11
import fitz
22
import pymorphy2
3-
import nltk
4-
nltk.download('stopwords')
53
from nltk.corpus import stopwords
64

75
import os

app/training_processor.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -87,6 +87,10 @@ def run(self):
8787

8888

8989
if __name__ == "__main__":
90+
import nltk
91+
nltk.download('stopwords')
92+
nltk.download('punkt')
93+
9094
Config.init_config(sys.argv[1])
9195
training_processor = TrainingProcessor()
9296
training_processor.run()

0 commit comments

Comments
 (0)