from bson import ObjectId

from app.root_logger import get_root_logger
from app.localisation import *
from ..criterion_base import BaseCriterion
from ..criterion_result import CriterionResult
from app.audio import Audio
from app.presentation import Presentation
from app.utils import normalize_text, delete_punctuation
from ..text_comparison import SlidesSimilarityEvaluator

logger = get_root_logger('web')


# Criterion that scores how well the student's speech on each slide
# matches the text of that slide.
class ComparisonSpeechSlidesCriterion(BaseCriterion):
    PARAMETERS = dict(
        skip_slides=list.__name__,
    )

    def __init__(self, parameters, dependent_criteria, name=''):
        super().__init__(
            name=name,
            parameters=parameters,
            dependent_criteria=dependent_criteria,
        )
        self.evaluator = SlidesSimilarityEvaluator()
        # Optional parameter: minimal average speech/slide similarity
        # required for a perfect score.
        if 'slide_speech_threshold' not in self.parameters:
            self.parameters['slide_speech_threshold'] = 0.125

    @property
    def description(self):
        """Localized, human-readable description of the criterion."""
        return {
            "Критерий": t(self.name),
            "Описание": t(
                "Проверяет, что текст слайда соответствует словам, которые произносит студент во время демонстрации "
                "этого слайда"),
            "Оценка": t("1, если среднее значение соответствия речи содержимому слайдов равно или превосходит заданного порога (от 0 до 1), "
                        "иначе r / значение порога, где r - среднее значение соответствия речи демонстрируемым слайдам")
        }

    def skip_slide(self, current_slide_text: str) -> bool:
        """Return True when the slide text contains one of the configured skip phrases."""
        cleaned_text = delete_punctuation(current_slide_text).lower()
        for skip_slide in self.parameters['skip_slides']:
            if skip_slide.lower() in cleaned_text:
                return True
        return False

    def apply(self, audio: Audio, presentation: Presentation, training_id: ObjectId,
              criteria_results: dict) -> CriterionResult:
        """Score the average per-slide similarity between speech and slide text.

        Returns 1 when the average similarity meets the threshold, otherwise
        (average / threshold) together with the list of weak slides.
        """
        # Per-slide comparison results: slide number (1-based) -> similarity
        results = {}
        # (speech, slide_text, slide_number) tuples to evaluate after training
        slides_to_process = []

        # Guard against a mismatch between the number of audio slides and
        # presentation slides (would otherwise raise IndexError).
        slide_count = min(len(audio.audio_slides), len(presentation.slides))
        for current_slide_index in range(slide_count):
            slide_number = current_slide_index + 1

            # Words spoken on this slide (RecognizedWord objects) -> plain words,
            # dropping timestamps and probabilities, then normalized.
            recognized_words = audio.audio_slides[current_slide_index].recognized_words
            current_slide_speech = " ".join(
                normalize_text([w.word.value for w in recognized_words]))

            current_slide_text = presentation.slides[current_slide_index].words
            # Check the skip-list BEFORE the empty-speech check, so that skip
            # slides (e.g. "Спасибо за внимание") are never scored at all.
            if self.skip_slide(current_slide_text):
                logger.info(f"Слайд №{slide_number} пропущен")
                continue

            # Nothing was said on this slide: score it as zero.
            if not current_slide_speech.split():
                results[slide_number] = 0.000
                continue

            normalized_slide_text = " ".join(normalize_text(current_slide_text.split()))
            slides_to_process.append((current_slide_speech, normalized_slide_text, slide_number))

        if slides_to_process:
            # Fit the TF-IDF model once on all speech and all slide texts.
            # Fitting on an empty corpus would raise ValueError, hence the guard.
            corpus = [
                " ".join(speech for speech, _, _ in slides_to_process),
                " ".join(slide_text for _, slide_text, _ in slides_to_process),
            ]
            self.evaluator.train_model(corpus)
            for speech, slide_text, slide_number in slides_to_process:
                results[slide_number] = self.evaluator.evaluate_semantic_similarity(speech, slide_text)

        # No scoreable slides at all (no audio slides, or every slide was
        # skipped): avoid ZeroDivisionError and report explicitly.
        if not results:
            return CriterionResult(0, "Тренажер не зафиксировал, что вы что-то говорили")

        results = dict(sorted(results.items()))

        threshold = self.parameters['slide_speech_threshold']
        score = (sum(results.values()) / len(results)) / threshold

        if score >= 1:
            return CriterionResult(1, "Отлично")
        weak_slides = ",\n".join(
            f"№{slide_number} - {value}"
            for slide_number, value in results.items()
            if value < threshold
        )
        return CriterionResult(
            score,
            "Следует уделить внимание соответствию речи на слайдах {}".format(weak_slides),
        )
from bson import ObjectId

from app.root_logger import get_root_logger
from app.localisation import *
from ..criterion_base import BaseCriterion
from ..criterion_result import CriterionResult
from app.audio import Audio
from app.presentation import Presentation
from app.utils import normalize_text
from ..text_comparison import Doc2VecEvaluator

logger = get_root_logger('web')


# Criterion that checks the topic of the whole speech against the topic
# of the presentation using Doc2Vec similarity.
class ComparisonWholeSpeechCriterion(BaseCriterion):
    PARAMETERS = dict(
        vector_size=int.__name__,
        window=int.__name__,
        min_count=int.__name__,
        workers=int.__name__,
        epochs=int.__name__,
        dm=int.__name__,
    )

    def __init__(self, parameters, dependent_criteria, name=''):
        super().__init__(
            name=name,
            parameters=parameters,
            dependent_criteria=dependent_criteria,
        )
        # All Doc2Vec hyper-parameters come from the criterion configuration.
        self.model = Doc2VecEvaluator(
            self.parameters['vector_size'],
            self.parameters['window'],
            self.parameters['min_count'],
            self.parameters['workers'],
            self.parameters['epochs'],
            self.parameters['dm'],
        )

    @property
    def description(self):
        """Localized, human-readable description of the criterion."""
        return {
            "Критерий": t(self.name),
            "Описание": t("Проверяет, что тема доклада студента совпадает с темой презентации"),
            "Оценка": t(
                "1, если тема доклада и презентации совпадают не менее, чем на 40%, иначе 2.5 * k, где k - степень соответствия темы доклада теме презентации")
        }

    def apply(self, audio: Audio, presentation: Presentation, training_id: ObjectId,
              criteria_results: dict) -> CriterionResult:
        """Compare the normalized full speech text with the normalized full slide text."""
        normalized_speech = []
        normalized_slides = []

        # Guard against a mismatch between the number of audio slides and
        # presentation slides (would otherwise raise IndexError).
        for i in range(min(len(audio.audio_slides), len(presentation.slides))):
            # Words spoken on this slide, stripped of timestamps/probabilities.
            spoken_words = [w.word.value for w in audio.audio_slides[i].recognized_words]
            current_slide_speech = " ".join(normalize_text(spoken_words))
            if current_slide_speech != "":
                normalized_speech.append(current_slide_speech)

            # Normalized text of the corresponding presentation slide.
            current_slide_text = " ".join(normalize_text(presentation.slides[i].words.split()))
            if current_slide_text != "":
                normalized_slides.append(current_slide_text)

        if len(normalized_speech) == 0:
            return CriterionResult(0, "Тренажер не зафиксировал, что вы что-то говорили")
        normalized_speech_text = " ".join(normalized_speech)

        if len(normalized_slides) == 0:
            return CriterionResult(0, "Загруженная вами презентация не содержит текста")
        normalized_slides_text = " ".join(normalized_slides)

        self.model.train_model([normalized_speech_text, normalized_slides_text])

        # A similarity of 0.4 (or higher) maps to a full score, hence the 2.5 factor.
        score = 2.5 * self.model.evaluate_semantic_similarity(normalized_speech_text, normalized_slides_text)
        logger.info("Score=%s", score)  # lazy formatting: args evaluated only if logged
        return CriterionResult(
            1 if score >= 1 else score,
            "Ваша речь соответствует тексту презентации" if score >= 1
            else "Ваша речь не полностью соответствует теме презентации")
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from gensim.models.doc2vec import Doc2Vec, TaggedDocument


class SlidesSimilarityEvaluator:
    """TF-IDF + cosine-similarity scorer for per-slide speech/slide comparison."""

    def __init__(self):
        # Unigram TF-IDF is sufficient for short slide/speech texts.
        self.vectorizer = TfidfVectorizer(ngram_range=(1, 1))

    def train_model(self, corpus: list):
        """Fit the vectorizer on the given list of documents."""
        self.vectorizer.fit(corpus)

    def evaluate_semantic_similarity(self, text1: str, text2: str) -> float:
        """Return the cosine similarity of the two texts, rounded to 3 digits."""
        vector1 = self.vectorizer.transform([text1])
        vector2 = self.vectorizer.transform([text2])
        similarity = cosine_similarity(vector1, vector2)[0][0]
        # float() converts the numpy scalar into a plain (JSON-serializable) float.
        return float(round(similarity, 3))


class Doc2VecEvaluator:
    """Doc2Vec-based semantic similarity scorer for whole-speech comparison."""

    def __init__(self, vector_size: int, window: int, min_count: int, workers: int, epochs: int, dm: int):
        self.model = Doc2Vec(vector_size=vector_size, window=window, min_count=min_count,
                             workers=workers, epochs=epochs, dm=dm)

    def train_model(self, documents: list):
        """Build the vocabulary and train the model on the given documents."""
        tagged_documents = [TaggedDocument(words=doc.split(), tags=[i]) for i, doc in enumerate(documents)]
        self.model.build_vocab(tagged_documents)
        self.model.train(tagged_documents, total_examples=self.model.corpus_count, epochs=self.model.epochs)

    def evaluate_semantic_similarity(self, text1: str, text2: str) -> float:
        """Return the word-vector set similarity of the two texts, rounded to 3 digits."""
        # min_count prunes rare words from the vocabulary, so out-of-vocabulary
        # words must be dropped here — otherwise n_similarity raises KeyError.
        vocabulary = self.model.wv.key_to_index
        words1 = [word for word in text1.split() if word in vocabulary]
        words2 = [word for word in text2.split() if word in vocabulary]
        # n_similarity is undefined for empty word lists; treat as no similarity.
        if not words1 or not words2:
            return 0.0
        return float(round(self.model.wv.n_similarity(words1, words2), 3))
StrictSpeechDurationCriterion + StrictSpeechDurationCriterion, ComparisonSpeechSlidesCriterion, ComparisonWholeSpeechCriterion class Feedback: diff --git a/app/presentation_parser/slide_splitter.py b/app/presentation_parser/slide_splitter.py index c402c180..efcc1568 100644 --- a/app/presentation_parser/slide_splitter.py +++ b/app/presentation_parser/slide_splitter.py @@ -1,7 +1,5 @@ import fitz import pymorphy2 -import nltk -nltk.download('stopwords') from nltk.corpus import stopwords import os diff --git a/app/training_processor.py b/app/training_processor.py index dd17dc3e..c7abc4c6 100644 --- a/app/training_processor.py +++ b/app/training_processor.py @@ -87,6 +87,10 @@ def run(self): if __name__ == "__main__": + import nltk + nltk.download('stopwords') + nltk.download('punkt') + Config.init_config(sys.argv[1]) training_processor = TrainingProcessor() training_processor.run() diff --git a/app/utils.py b/app/utils.py index cc2d39b8..79bf3795 100644 --- a/app/utils.py +++ b/app/utils.py @@ -1,4 +1,6 @@ import os +import string +import re import tempfile from distutils.util import strtobool from threading import Timer @@ -7,6 +9,8 @@ from bson import ObjectId from flask import json import magic +import pymorphy2 +from nltk.corpus import stopwords from pydub import AudioSegment import subprocess @@ -16,11 +20,11 @@ SECONDS_PER_MINUTE = 60 BYTES_PER_MEGABYTE = 1024 * 1024 ALLOWED_MIMETYPES = { - 'pdf': ['application/pdf'], - 'ppt': ['application/vnd.ms-powerpoint'], - 'odp': ['application/vnd.oasis.opendocument.presentation'], - 'pptx': ['application/vnd.openxmlformats-officedocument.presentationml.presentation', 'application/zip'] - } + 'pdf': ['application/pdf'], + 'ppt': ['application/vnd.ms-powerpoint'], + 'odp': ['application/vnd.oasis.opendocument.presentation'], + 'pptx': ['application/vnd.openxmlformats-officedocument.presentationml.presentation', 'application/zip'] +} CONVERTIBLE_EXTENSIONS = ('ppt', 'pptx', 'odp') ALLOWED_EXTENSIONS = 
# Translation table that strips punctuation characters (used by normalize_text).
_PUNCTUATION_TABLE = str.maketrans("", "", string.punctuation)
# Translation table that strips punctuation and whitespace control characters.
_DELETE_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation + "\t\n\r\v\f")
# pymorphy2.MorphAnalyzer is expensive to construct (it loads dictionaries from
# disk), so it is created lazily exactly once instead of on every call.
_MORPH_ANALYZER = None


def _get_morph_analyzer():
    """Return the shared pymorphy2.MorphAnalyzer, creating it on first use."""
    global _MORPH_ANALYZER
    if _MORPH_ANALYZER is None:
        _MORPH_ANALYZER = pymorphy2.MorphAnalyzer()
    return _MORPH_ANALYZER


# Text normalization function
def normalize_text(text: list) -> list:
    """Normalize a list of words for comparison.

    Strips punctuation, lowercases, keeps only Russian alphabetic tokens,
    lemmatizes each word and drops Russian stop-words.
    """
    morph = _get_morph_analyzer()
    # Strip punctuation, lowercase and trim surrounding whitespace
    words = [word.translate(_PUNCTUATION_TABLE).lower().strip() for word in text]
    # Drop digits and any non-Russian characters
    words = [re.sub(r'[^А-яёЁ\s]', '', word) for word in words]
    # Keep only purely alphabetic tokens (this also removes empty strings)
    words = [word for word in words if word.isalpha()]
    # Reduce each word to its normal (dictionary) form
    words = [morph.normal_forms(word)[0] for word in words]
    # Remove stop-words
    return [word for word in words if word not in RussianStopwords().words]


# Remove punctuation (and whitespace control characters) from the text
def delete_punctuation(text: str) -> str:
    return text.translate(_DELETE_PUNCTUATION_TABLE)
class Singleton(type):
    """Metaclass: every class that uses it yields exactly one shared instance."""

    _instances = {}

    def __call__(cls, *args, **kwargs):
        # Build the instance on first call only; afterwards always hand back
        # the cached one.
        try:
            return cls._instances[cls]
        except KeyError:
            instance = super().__call__(*args, **kwargs)
            cls._instances[cls] = instance
            return instance


class RussianStopwords(metaclass=Singleton):
    """Shared, lazily-loaded list of Russian NLTK stop-words."""

    def __init__(self):
        # Runs once per process thanks to the Singleton metaclass.
        self.words = stopwords.words('russian')