moevm · Toropigeon · Jan 19, 2024 · Jan 19, 2024 · Mar 15, 2024 · Mar 15, 2024
diff --git a/app/main/check_packs/pack_config.py b/app/main/check_packs/pack_config.py
@@ -39,7 +39,9 @@
     ["header_check"],
     ["report_section_component"],
     ["main_text_check"],
-    ["spelling_check"]
+    ["spelling_check"],
+    ["compare_goal_and_content_check"],
+    ["compare_tasks_and_content_check"]
 ]
 
 DEFAULT_TYPE = 'pres'

diff --git a/app/main/checks/__init__.py b/app/main/checks/__init__.py
@@ -37,6 +37,8 @@
         ReportChapters.id: ReportChapters,
         ReportSectionComponent.id: ReportSectionComponent,
         ReportMainTextCheck.id: ReportMainTextCheck,
-        SpellingCheck.id: SpellingCheck
+        SpellingCheck.id: SpellingCheck,
+        CompareGoalAndContentCheck.id: CompareGoalAndContentCheck,
+        CompareTasksAndContentCheck.id: CompareTasksAndContentCheck
     }
 }
diff --git a/app/main/checks/report_checks/__init__.py b/app/main/checks/report_checks/__init__.py
@@ -21,4 +21,5 @@
 from .sections_check import LRReportSectionCheck
 from .style_check import ReportStyleCheck
 from .spelling_check import SpellingCheck
-
+from .compare_goal_and_content import CompareGoalAndContentCheck
+from .compare_tasks_and_content import CompareTasksAndContentCheck
diff --git a/app/main/checks/report_checks/compare_goal_and_content.py b/app/main/checks/report_checks/compare_goal_and_content.py
@@ -0,0 +1,132 @@
+from ..base_check import BaseReportCriterion, answer
+
+import app.nlp.text_similarity as ts
+
+
+class CompareGoalAndContentCheck(BaseReportCriterion):
+    """Check that the goal stated in the introduction is covered by the chapters.
+
+    The goal is the text between the first "Цель" found in the "ВВЕДЕНИЕ" chapter
+    and the next full stop. Every non-ignored chapter is scored against it with
+    ``NLPProcessor.calculate_cosine_similarity``; a score is multiplied by the
+    chapter weight and divided by the best raw score, and the check passes when
+    the average of these values is at least ``to_pass``.
+    """
+
+    description = "Проверка соответствия цели и содержания"
+    id = 'compare_goal_and_content_check'
+
+    def __init__(self, file_info):
+        super().__init__(file_info)
+        self.headers = []
+        self.goal = ""
+        self.chapters = {}
+        self.weights = {}
+        self.to_pass = 0
+        self.to_ignore = []
+
+    def late_init(self):
+        """Load the chapter tree of the file and the scoring settings."""
+        self.headers = self.file.make_chapters(self.file_type['report_type'])
+        # Multiplier for a chapter whose title starts with the given key.
+        self.weights = {
+            "ВВЕДЕНИЕ": 1,
+            "1": 2,
+            "2": 2,
+            "3": 5,
+            "4": 2,
+            "5": 1,
+            "ЗАКЛЮЧЕНИЕ": 1
+        }
+        # Lowest average score that still passes the check.
+        self.to_pass = 0.1
+        # Chapters whose title contains one of these phrases are not scored.
+        self.to_ignore = ["СПИСОК ИСПОЛЬЗОВАННЫХ ИСТОЧНИКОВ", "ПРИЛОЖЕНИЕ"]
+
+    def check(self):
+        """Run the check and return its ``answer(passed, message)`` result."""
+        self.late_init()
+        if self.file.page_counter() < 4:
+            return answer(False, "В отчете недостаточно страниц. Нечего проверять.")
+        if not self._extract_goal():
+            return answer(False, "В введении не найдена цель работы")
+        self._collect_chapters()
+        scores = self._weighted_scores(self.goal)
+        # No scores means nothing matched the goal at all; count that as 0 instead of
+        # crashing on an empty sequence or a zero divisor.
+        avg = round(sum(scores.values()) / len(scores), 3) if scores else 0.0
+        if avg < self.to_pass:
+            return answer(False,
+                          f"Цель недостаточно раскрыта в содержании (нужно {round(self.to_pass * 100, 1)}%, набрано {round(avg * 100, 1)}%)")
+        ranked = sorted(scores.items(), key=lambda item: item[1], reverse=True)
+        total = sum(scores.values())
+        lines = [
+            f"<br><b>Тема раскрыта на {round(avg * 100, 1)}%</b><br>",
+            "<br><b>7 разделов, наиболее раскрывающих тему:</b><br>",
+        ]
+        lines.extend(self._chapter_line(title, score, total) for title, score in ranked[:7])
+        lines.append("<br><b>7 разделов, наименее раскрывающих тему:</b><br>")
+        lines.extend(self._chapter_line(title, score, total) for title, score in ranked[-7:])
+        return answer(True, "".join(lines))
+
+    def _extract_goal(self):
+        """Fill ``self.goal`` from the introduction; return False if "Цель" is absent."""
+        # Paragraphs are joined with a space so that words of neighbouring paragraphs
+        # do not merge into one token.
+        intro_text = " ".join(
+            child["text"]
+            for header in self.headers if header["text"] == "ВВЕДЕНИЕ"
+            for child in header["child"]
+        )
+        goal_index = intro_text.find("Цель")
+        # str.find returns -1 when nothing is found; index 0 is a valid match.
+        if goal_index < 0:
+            return False
+        goal_start = goal_index + len("Цель") + 1
+        goal_end = intro_text.find(".", goal_start)
+        if goal_end < 0:
+            # No full stop after the goal: take the rest of the introduction.
+            goal_end = len(intro_text)
+        self.goal = intro_text[goal_start:goal_end]
+        return True
+
+    def _collect_chapters(self):
+        """Store the non-empty text of every non-ignored chapter in ``self.chapters``."""
+        self.chapters = {}
+        for header in self.headers:
+            title = header["text"]
+            if any(ignore_phrase in title for ignore_phrase in self.to_ignore):
+                continue
+            self.chapters[title] = " ".join(child["text"] for child in header["child"])
+        self.chapters = {k: v for k, v in self.chapters.items() if v and v.strip()}
+
+    def _weighted_scores(self, query):
+        """Return ``{chapter title: similarity * weight / best raw similarity}``.
+
+        An empty dict is returned when there are no chapters to compare or when no
+        chapter has a positive similarity to ``query``.
+        """
+        raw = ts.NLPProcessor().calculate_cosine_similarity(query, self.chapters)
+        if not raw:
+            return {}
+        best = max(raw.values())
+        if best <= 0:
+            return {}
+        return {title: value * self._weight_for(title) / best for title, value in raw.items()}
+
+    def _weight_for(self, title):
+        """Return the weight of the first ``self.weights`` key that ``title`` starts with, else 1."""
+        for prefix, weight in self.weights.items():
+            if title.startswith(prefix):
+                return weight
+        return 1
+
+    def _chapter_line(self, title, score, total):
+        """Format one report line with the chapter's share of the total score."""
+        return f"<br>\"{title}\", {self.__output(score, total)}% текста раскрывают тему<br>"
+
+    def __output(self, value, summ):
+        """Return ``value`` as a percentage of ``summ``, rounded to one decimal place."""
+        if not summ:
+            return 0.0
+        return round(value / summ * 100, 1)
diff --git a/app/main/checks/report_checks/compare_tasks_and_content.py b/app/main/checks/report_checks/compare_tasks_and_content.py
@@ -0,0 +1,142 @@
+from ..base_check import BaseReportCriterion, answer
+
+import app.nlp.text_similarity as ts
+
+
+class CompareTasksAndContentCheck(BaseReportCriterion):
+    """Check that the tasks listed in the introduction are covered by the chapters.
+
+    The tasks are the paragraphs of the "ВВЕДЕНИЕ" chapter between the last paragraph
+    mentioning "задачи" and the paragraph mentioning "объект ... исследования". All
+    tasks together and every task on its own are scored against the chapters with
+    ``NLPProcessor.calculate_cosine_similarity``; a score is multiplied by the chapter
+    weight and divided by the best raw score.
+    """
+
+    description = "Проверка соответствия задач и содержания"
+    id = 'compare_tasks_and_content_check'
+
+    def __init__(self, file_info):
+        super().__init__(file_info)
+        self.headers = []
+        self.tasks = []
+        self.chapters = {}
+        self.weights = {}
+        self.all_to_pass = 0
+        self.specific_to_pass = 0
+        self.to_ignore = []
+
+    def late_init(self):
+        """Load the chapter tree of the file and the scoring settings."""
+        self.headers = self.file.make_chapters(self.file_type['report_type'])
+        # Multiplier for a chapter whose title starts with the given key.
+        self.weights = {
+            "ВВЕДЕНИЕ": 1,
+            "1": 2,
+            "2": 2,
+            "3": 5,
+            "4": 2,
+            "5": 1,
+            "ЗАКЛЮЧЕНИЕ": 1
+        }
+        # Lowest average score that passes for all tasks together / for a single task.
+        self.all_to_pass = 0.15
+        self.specific_to_pass = 0.05
+        # Chapters whose title contains one of these phrases are not scored.
+        self.to_ignore = ["СПИСОК ИСПОЛЬЗОВАННЫХ ИСТОЧНИКОВ", "ПРИЛОЖЕНИЕ"]
+
+    def check(self):
+        """Run the check and return its ``answer(passed, message)`` result."""
+        self.late_init()
+        if self.file.page_counter() < 4:
+            return answer(False, "В отчете недостаточно страниц. Нечего проверять.")
+        self.tasks = self._extract_tasks()
+        if not self.tasks:
+            # Without tasks every similarity is zero and the scores cannot be normalised.
+            return answer(False, "В введении не найдены задачи работы")
+        self._collect_chapters()
+        nlp_processor = ts.NLPProcessor()
+        # Scores for all tasks at once are rounded per chapter, as they are printed below.
+        all_tasks_result = {
+            title: round(score, 3)
+            for title, score in self._weighted_scores(nlp_processor, " ".join(self.tasks)).items()
+        }
+        avg = self._average(all_tasks_result)
+        if avg < self.all_to_pass:
+            return answer(False, f"Задачи недостаточно раскрыты в содержании (нужно {round(self.all_to_pass * 100, 1)}%, набрано {round(avg * 100, 1)}%)")
+        lines = [f"<br><b>Задачи раскрыты на {round(avg * 100, 1)}%</b><br>"]
+        for task in self.tasks:
+            cur_task = self._weighted_scores(nlp_processor, task)
+            specific_avg = self._average(cur_task)
+            if specific_avg < self.specific_to_pass:
+                return answer(False, f"<br>Задача \"{task}\" недостаточно раскрыта<br>")
+            lines.append(f"<br><b>Задача \"{task}\" раскрыта на {round(specific_avg * 100, 2)}%</b><br><br>Задачу \"{task}\" наиболее раскрывают разделы: <br>")
+            total = sum(cur_task.values())
+            ranked = sorted(cur_task.items(), key=lambda item: item[1], reverse=True)
+            for title, score in ranked[:3]:
+                lines.append(f"<br>\"{title}\", {round(self.__output(score, total), 3)}% текста раскрывают задачу<br>")
+        lines.append("<br><b>Разделы, наименее раскрывающие задачи:</b><br>")
+        ranked_all = sorted(all_tasks_result.items(), key=lambda item: item[1], reverse=True)
+        for title, score in ranked_all[-5:]:
+            lines.append(f"<br>{title}: {round(score * 100, 1)}%<br>")
+        return answer(True, "".join(lines))
+
+    def _extract_tasks(self):
+        """Return the texts of the task paragraphs of the introduction, or [] if not found."""
+        for header in self.headers:
+            if header["text"] != "ВВЕДЕНИЕ":
+                continue
+            last_tasks_index = None
+            for i, child in enumerate(header["child"]):
+                text = child["text"].lower()
+                if "задачи" in text:
+                    last_tasks_index = i
+                if "объект" in text and "исследования" in text:
+                    if last_tasks_index is None:
+                        return []
+                    return [task["text"] for task in header["child"][last_tasks_index + 1:i]]
+        return []
+
+    def _collect_chapters(self):
+        """Store the non-empty text of every non-ignored chapter in ``self.chapters``."""
+        self.chapters = {}
+        for header in self.headers:
+            title = header["text"]
+            if any(ignore_phrase in title for ignore_phrase in self.to_ignore):
+                continue
+            # Paragraphs are joined with a space so that words of neighbouring
+            # paragraphs do not merge into one token.
+            self.chapters[title] = " ".join(child["text"] for child in header["child"])
+        self.chapters = {k: v for k, v in self.chapters.items() if v and v.strip()}
+
+    def _weighted_scores(self, nlp_processor, query):
+        """Return ``{chapter title: similarity * weight / best raw similarity}``.
+
+        An empty dict is returned when there are no chapters to compare or when no
+        chapter has a positive similarity to ``query``.
+        """
+        raw = nlp_processor.calculate_cosine_similarity(query, self.chapters)
+        if not raw:
+            return {}
+        best = max(raw.values())
+        if best <= 0:
+            return {}
+        return {title: value * self._weight_for(title) / best for title, value in raw.items()}
+
+    def _weight_for(self, title):
+        """Return the weight of the first ``self.weights`` key that ``title`` starts with, else 1."""
+        for prefix, weight in self.weights.items():
+            if title.startswith(prefix):
+                return weight
+        return 1
+
+    @staticmethod
+    def _average(scores):
+        """Return the mean of the dict values rounded to 3 places, or 0.0 for an empty dict."""
+        return round(sum(scores.values()) / len(scores), 3) if scores else 0.0
+
+    def __output(self, value, summ):
+        """Return ``value`` as a percentage of ``summ`` (0.0 when ``summ`` is zero)."""
+        if not summ:
+            return 0.0
+        return (value / summ) * 100
diff --git a/app/nlp/text_similarity.py b/app/nlp/text_similarity.py
@@ -0,0 +1,113 @@
+from collections import Counter, defaultdict
+
+import nltk
+import numpy as np
+from nltk import word_tokenize
+from nltk.corpus import stopwords
+from nltk.stem import SnowballStemmer
+from nltk.util import ngrams
+import string
+
+
+class NLPProcessor:
+    """Compare texts by the cosine similarity of TF-IDF vectors built from stemmed n-grams."""
+
+    # NLTK resources used by this class: (path checked locally, package to download).
+    _NLTK_RESOURCES = (('tokenizers/punkt', 'punkt'), ('corpora/stopwords', 'stopwords'))
+    _PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)
+
+    def __init__(self, language='russian'):
+        """Make sure the NLTK data is installed and load stop words and a stemmer."""
+        # Download only what is missing, so that creating a processor does not
+        # contact the NLTK server on every call.
+        for resource_path, package in self._NLTK_RESOURCES:
+            try:
+                nltk.data.find(resource_path)
+            except LookupError:
+                nltk.download(package)
+        self.stop_words = set(stopwords.words(language))
+        self.stemmer = SnowballStemmer(language)
+
+    def preprocessing(self, text):
+        """Return the stems of the tokens of ``text`` that are not stop words.
+
+        ASCII punctuation (``string.punctuation``) is removed before tokenizing.
+        """
+        text = text.translate(self._PUNCTUATION_TABLE)
+        tokens = word_tokenize(text)
+        return [self.stemmer.stem(token) for token in tokens if token.lower() not in self.stop_words]
+
+    def get_ngrams(self, tokens, n=2):
+        """Return all 1-grams ... n-grams of ``tokens`` as space-joined strings."""
+        result = []
+        for size in range(1, n + 1):
+            result.extend(' '.join(grams) for grams in ngrams(tokens, size))
+        return result
+
+    def get_bag_of_n_gramms(self, corpus):
+        """Map every distinct n-gram of ``corpus`` to its index in order of first appearance."""
+        index_word = {}
+        for item in corpus:
+            for n_gramm in item:
+                index_word.setdefault(n_gramm, len(index_word))
+        return index_word
+
+    def get_vector_by_BOW(self, bag_of_ngramms, doc, docs, doc_freq=None):
+        """Return the weight vector of ``doc`` over the ``bag_of_ngramms`` vocabulary.
+
+        Args:
+            bag_of_ngramms: mapping n-gram -> vector index; must contain every n-gram of ``doc``.
+            doc: list of n-grams of the document.
+            docs: list of all documents (lists of n-grams) used for the IDF part.
+            doc_freq: optional precomputed ``{n-gram: number of docs containing it}``;
+                computed from ``docs`` when omitted.
+        """
+        vec = np.zeros(len(bag_of_ngramms))
+        if not doc:
+            return vec
+        if doc_freq is None:
+            doc_freq = self._document_frequencies(docs)
+        doc_len = len(doc)
+        docs_count = len(docs)
+        for word, count in Counter(doc).items():
+            tf = count / doc_len
+            idf = np.log10(docs_count / (doc_freq.get(word, 0) + 1))
+            # NOTE(review): tf * idf is added once per occurrence of the n-gram (hence
+            # ``count *``), kept so that existing scores do not change - confirm intent.
+            vec[bag_of_ngramms[word]] = count * tf * idf
+        return vec
+
+    @staticmethod
+    def _document_frequencies(docs):
+        """Count, for every n-gram, the documents of ``docs`` that contain it."""
+        doc_freq = Counter()
+        for item in docs:
+            doc_freq.update(set(item))
+        return doc_freq
+
+    def cosine_similarity(self, vector1, vector2):
+        """Return the cosine similarity rounded to 3 places, or 0 if either vector is all zeros."""
+        norm1 = np.linalg.norm(vector1)
+        norm2 = np.linalg.norm(vector2)
+        if norm1 == 0.0 or norm2 == 0.0:
+            return 0
+        return round(np.dot(vector1, vector2) / (norm1 * norm2), 3)
+
+    def calculate_cosine_similarity(self, goal, texts: dict):
+        """Return ``{key of texts: cosine similarity between its text and goal}``.
+
+        An empty dict is returned when ``texts`` is empty or None.
+        """
+        if not texts:
+            return {}
+        goal_n_grams = self.get_ngrams(self.preprocessing(goal or ""))
+        text_n_grams = {chapter: self.get_ngrams(self.preprocessing(text)) for chapter, text in texts.items()}
+        corpus = [goal_n_grams, *text_n_grams.values()]
+        bag_of_n_grams = self.get_bag_of_n_gramms(corpus)
+        # Document frequencies are the same for every vector, so compute them once.
+        doc_freq = self._document_frequencies(corpus)
+        goal_vector = self.get_vector_by_BOW(bag_of_n_grams, goal_n_grams, corpus, doc_freq)
+        return {
+            chapter: self.cosine_similarity(goal_vector, self.get_vector_by_BOW(bag_of_n_grams, n_grams, corpus, doc_freq))
+            for chapter, n_grams in text_n_grams.items()
+        }