4 changes: 3 additions & 1 deletion app/main/check_packs/pack_config.py
@@ -39,7 +39,9 @@
["header_check"],
["report_section_component"],
["main_text_check"],
["spelling_check"]
["spelling_check"],
["compare_goal_and_content_check"],
["compare_tasks_and_content_check"]
]

DEFAULT_TYPE = 'pres'
4 changes: 3 additions & 1 deletion app/main/checks/__init__.py
@@ -37,6 +37,8 @@
        ReportChapters.id: ReportChapters,
        ReportSectionComponent.id: ReportSectionComponent,
        ReportMainTextCheck.id: ReportMainTextCheck,
        SpellingCheck.id: SpellingCheck
        SpellingCheck.id: SpellingCheck,
        CompareGoalAndContentCheck.id: CompareGoalAndContentCheck,
        CompareTasksAndContentCheck.id: CompareTasksAndContentCheck
    }
}
3 changes: 2 additions & 1 deletion app/main/checks/report_checks/__init__.py
@@ -21,4 +21,5 @@
from .sections_check import LRReportSectionCheck
from .style_check import ReportStyleCheck
from .spelling_check import SpellingCheck

from .compare_goal_and_content import CompareGoalAndContentCheck
from .compare_tasks_and_content import CompareTasksAndContentCheck
86 changes: 86 additions & 0 deletions app/main/checks/report_checks/compare_goal_and_content.py
@@ -0,0 +1,86 @@
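# Compares the goal stated in the introduction with the text of each chapter,
# using the n-gram TF-IDF cosine-similarity helpers in app/nlp/text_similarity.py.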
from ..base_check import BaseReportCriterion, answer

import app.nlp.text_similarity as ts


class CompareGoalAndContentCheck(BaseReportCriterion):
description = "Проверка соответствия цели и содержания"
id = 'compare_goal_and_content_check'

    def __init__(self, file_info):
        super().__init__(file_info)
        self.headers = []
        self.goal = ""
        self.chapters = {}
        self.weights = {}
        self.to_pass = 0
        self.to_ignore = []

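    # Deferred setup: the headers come from the parsed file, so this runs at
    # check time. Weights are keyed by heading prefix (chapters starting with
    # "3" weigh most); to_pass is the minimum average score needed to pass.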
    def late_init(self):
        self.headers = self.file.make_chapters(self.file_type['report_type'])
        self.weights = {
            "ВВЕДЕНИЕ": 1,
            "1": 2,
            "2": 2,
            "3": 5,
            "4": 2,
            "5": 1,
            "ЗАКЛЮЧЕНИЕ": 1
        }
        self.to_pass = 0.1
        self.to_ignore = ["СПИСОК ИСПОЛЬЗОВАННЫХ ИСТОЧНИКОВ", "ПРИЛОЖЕНИЕ"]

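    # Pipeline: pull the goal sentence from the introduction, collect chapter
    # texts, score each chapter's cosine similarity to the goal, apply weights,
    # normalize by the best score, and report the average plus the most and
    # least relevant chapters.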
    def check(self):
        self.late_init()
        if self.file.page_counter() < 4:
            return answer(False, "В отчете недостаточно страниц. Нечего проверять.")
        result = ""
        intro_text = ""
        for header in self.headers:
            if header["text"] == "ВВЕДЕНИЕ":
                for child in header["child"]:
                    intro_text += child["text"]
        goal_index = intro_text.find("Цель")
        if goal_index >= 0:
            goal_start = goal_index + len("Цель") + 1
            goal_end = intro_text.find(".", goal_start)
            self.goal = intro_text[goal_start:goal_end]
        else:
            return answer(False, "Во введении не найдена цель работы")
        for header in self.headers:
            if any(ignore_phrase in header["text"] for ignore_phrase in self.to_ignore):
                continue
            text = ""
            for child in header["child"]:
                text += child['text']
            self.chapters[header["text"]] = text
        self.chapters = {k: v for k, v in self.chapters.items() if v and v.strip()}
        nlp_processor = ts.NLPProcessor()
        calculate_result = nlp_processor.calculate_cosine_similarity(self.goal, self.chapters)
        max_result = max(calculate_result.values())
        for k, v in calculate_result.items():
            for chapter, weight in self.weights.items():
                if k.find(chapter) == 0:
                    calculate_result[k] = v * weight
                    break
            calculate_result[k] = calculate_result[k] / max_result
        avg = round(sum(calculate_result.values()) / len(calculate_result.values()), 3)
        if avg < self.to_pass:
            return answer(False,
                          f"Цель недостаточно раскрыта в содержании (нужно {self.to_pass * 100:.1f}%, набрано {avg * 100:.1f}%)")
        result += f"<br><b>Тема раскрыта на {avg * 100:.1f}%</b><br>"
        sorted_chapters = dict(sorted(calculate_result.items(), key=lambda item: item[1], reverse=True))
        result += "<br><b>7 разделов, наиболее раскрывающих тему:</b><br>"
        for i, key in enumerate(sorted_chapters.keys()):
            if i >= 7:
                break
            result += f"<br>\"{key}\", {round(self.__output(sorted_chapters[key], sum(sorted_chapters.values())), 3)}% текста раскрывают тему<br>"
        result += "<br><b>7 разделов, наименее раскрывающих тему:</b><br>"
        for i, key in enumerate(sorted_chapters.keys()):
            if i < len(sorted_chapters) - 7:
                continue
            result += f"<br>\"{key}\", {round(self.__output(sorted_chapters[key], sum(sorted_chapters.values())), 3)}% текста раскрывают тему<br>"
        return answer(True, result)

    def __output(self, value, summ):
        return value / summ * 100
100 changes: 100 additions & 0 deletions app/main/checks/report_checks/compare_tasks_and_content.py
@@ -0,0 +1,100 @@
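# Same similarity machinery as CompareGoalAndContentCheck, applied to the task
# list from the introduction: tasks are scored jointly and then one by one.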
from ..base_check import BaseReportCriterion, answer

import app.nlp.text_similarity as ts


class CompareTasksAndContentCheck(BaseReportCriterion):
description = "Проверка соответствия задач и содержания"
id = 'compare_tasks_and_content_check'

    def __init__(self, file_info):
        super().__init__(file_info)
        self.headers = []
        self.tasks = []
        self.chapters = {}
        self.weights = {}
        self.all_to_pass = 0
        self.specific_to_pass = 0
        self.to_ignore = []

    def late_init(self):
        self.headers = self.file.make_chapters(self.file_type['report_type'])
        self.weights = {
            "ВВЕДЕНИЕ": 1,
            "1": 2,
            "2": 2,
            "3": 5,
            "4": 2,
            "5": 1,
            "ЗАКЛЮЧЕНИЕ": 1
        }
        self.all_to_pass = 0.15
        self.specific_to_pass = 0.05
        self.to_ignore = ["СПИСОК ИСПОЛЬЗОВАННЫХ ИСТОЧНИКОВ", "ПРИЛОЖЕНИЕ"]

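    # Pipeline: locate the task list in the introduction (the lines between the
    # "задачи" marker and the "объект ... исследования" line), then score all
    # tasks together against each chapter, and afterwards each task separately.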
    def check(self):
        self.late_init()
        if self.file.page_counter() < 4:
            return answer(False, "В отчете недостаточно страниц. Нечего проверять.")
        result = ""
        possible_tasks = []
        for header in self.headers:
            if header["text"] == "ВВЕДЕНИЕ":
                for i, child in enumerate(header["child"]):
                    if child["text"].lower().find("задачи") >= 0:
                        possible_tasks.append(i)
                    if child["text"].lower().find("объект") >= 0 and child["text"].lower().find("исследования") >= 0:
                        if not possible_tasks:
                            return answer(False, "Во введении не найдены задачи работы")
                        tasks = header["child"][max(possible_tasks) + 1:i]
                        self.tasks = [task["text"] for task in tasks]
                        break
            if any(ignore_phrase in header["text"] for ignore_phrase in self.to_ignore):
                continue
            text = ""
            for child in header["child"]:
                text += child['text']
            self.chapters[header["text"]] = text
        self.chapters = {k: v for k, v in self.chapters.items() if v and v.strip()}
        nlp_processor = ts.NLPProcessor()
        all_tasks_result = nlp_processor.calculate_cosine_similarity(" ".join(self.tasks), self.chapters)
        max_result = max(all_tasks_result.values())
        for k, v in all_tasks_result.items():
            for chapter, weight in self.weights.items():
                if k.find(chapter) == 0:
                    all_tasks_result[k] = v * weight
                    break
            all_tasks_result[k] = round(all_tasks_result[k] / max_result, 3)
        avg = round(sum(all_tasks_result.values()) / len(all_tasks_result.values()), 3)
        if avg < self.all_to_pass:
            return answer(False, f"Задачи недостаточно раскрыты в содержании (нужно {self.all_to_pass * 100:.1f}%, набрано {avg * 100:.1f}%)")
result += f"<br><b>Задачи раскрыты на {avg * 100}%</b><br>"
for task in self.tasks:
cur_task = NLPProcessor.calculate_cosine_similarity(task, self.chapters)
max_result = max(cur_task.values())
for k, v in cur_task.items():
for chapter, weight in self.weights.items():
if k.find(chapter) == 0:
cur_task[k] = v * weight
break
cur_task[k] = cur_task[k] / max_result
sorted_chapters = dict(sorted(cur_task.items(), key=lambda item: item[1], reverse=True))
specific_avg = sum(sorted_chapters.values()) / len(sorted_chapters.values())
specific_avg = round(specific_avg, 3)
if specific_avg < self.specific_to_pass:
return answer(False, f"<br>Задача \"{task}\" недостаточно раскрыта<br>")
result += f"<br><b>Задача \"{task}\" раскрыта на {round(specific_avg * 100, 2)}%</b><br><br>Задачу \"{task}\" наиболее раскрывают разделы: <br>"
for i, key in enumerate(sorted_chapters.keys()):
if i >= 3:
break
result += f"<br>\"{key}\", {round(self.__output(sorted_chapters[key], sum(sorted_chapters.values())), 3)}% текста раскрывают задачу<br>"
        all_tasks_result = dict(sorted(all_tasks_result.items(), key=lambda item: item[1], reverse=True))
        result += "<br><b>Разделы, наименее раскрывающие задачи:</b><br>"
        for i, key in enumerate(all_tasks_result.keys()):
            if i < len(all_tasks_result.keys()) - 5:
                continue
            result += f"<br>{key}: {round(all_tasks_result[key] * 100, 1)}%<br>"
        return answer(True, result)

    def __output(self, value, summ):
        return (value / summ) * 100
97 changes: 97 additions & 0 deletions app/nlp/text_similarity.py
@@ -0,0 +1,97 @@
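# Small self-contained TF-IDF / n-gram toolkit (NLTK for tokenization and
# stemming, NumPy for vectors) backing the goal/tasks-vs-content checks.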
from collections import defaultdict

import nltk
import numpy as np
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk.util import ngrams
import string


class NLPProcessor:
    def __init__(self, language='russian'):
        nltk.download('punkt')
        nltk.download('stopwords')
        self.stop_words = set(stopwords.words(language))
        self.stemmer = SnowballStemmer(language)

    def preprocessing(self, text):
        text = text.translate(str.maketrans('', '', string.punctuation))
        tokens = word_tokenize(text)
        tokens = [word for word in tokens if word.lower() not in self.stop_words]
        return [self.stemmer.stem(token) for token in tokens]

    def get_ngrams(self, tokens, n=2):
        result = []
        for i in range(n):
            n_grams = ngrams(tokens, i + 1)
            result.extend([' '.join(grams) for grams in n_grams])
        return result

    def get_bag_of_n_gramms(self, corpus):
        new_corpus = []
        for item in corpus:
            for n_gramm in item:
                new_corpus.append(n_gramm)
        index_word = {}
        i = 0
        for word in new_corpus:
            if word in index_word:
                continue
            index_word[word] = i
            i += 1
        return index_word

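    # TF-IDF vector over the shared n-gram vocabulary. Note: every occurrence
    # of an n-gram in the document adds its full tf*idf term, so frequent
    # n-grams are emphasized; the +1 in the idf denominator avoids division
    # by zero for n-grams absent from all documents.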
    def get_vector_by_BOW(self, bag_of_ngramms, doc, docs):
        def tf(word, doc):
            return doc.count(word) / len(doc)

        def idf(word, docs):
            word_in_docs = 0
            for item in docs:
                if word in item:
                    word_in_docs += 1
            return np.log10(len(docs) / (word_in_docs + 1))

        def tf_idf(word, doc, docs):
            return tf(word, doc) * idf(word, docs)

        count_dict = defaultdict(int)
        vec = np.zeros(len(bag_of_ngramms))
        for word in doc:
            count_dict[word] += tf_idf(word, doc, docs)

        for key, item in count_dict.items():
            vec[bag_of_ngramms[key]] = item
        return vec

    def cosine_similarity(self, vector1, vector2):
        norm1 = np.linalg.norm(vector1)
        norm2 = np.linalg.norm(vector2)
        dot_product = np.dot(vector1, vector2)
        if norm1 == 0.0 or norm2 == 0.0:
            return 0
        cosine_sim = dot_product / (norm1 * norm2)
        return round(cosine_sim, 3)

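    # End-to-end helper used by the checks: builds a common n-gram vocabulary
    # from the goal plus all chapter texts, vectorizes each document, and
    # returns {chapter: cosine similarity to the goal}.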
    def calculate_cosine_similarity(self, goal, texts: dict):
        if not goal or not texts:
            return {}
        corpus = []
        text1_n_grams = self.get_ngrams(self.preprocessing(goal))
        text2_n_grams = {}
        for chapter in texts.keys():
            text2_n_grams[chapter] = self.get_ngrams(self.preprocessing(texts[chapter]))
        corpus.append(text1_n_grams)
        corpus.extend(text2_n_grams.values())
        bag_of_n_grams = self.get_bag_of_n_gramms(corpus)
        goal_vector = self.get_vector_by_BOW(bag_of_n_grams, text1_n_grams, corpus)
        text_vectors = {}
        for chapter, text in text2_n_grams.items():
            text_vectors[chapter] = self.get_vector_by_BOW(bag_of_n_grams, text, corpus)
        result = {}
        for chapter in text_vectors.keys():
            text_vector = text_vectors[chapter]
            result[chapter] = self.cosine_similarity(goal_vector, text_vector)
        return result
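# Minimal usage sketch (illustrative strings, not taken from the repository):
#
#   processor = NLPProcessor()
#   scores = processor.calculate_cosine_similarity(
#       "повысить точность проверки отчетов",
#       {"1 Обзор": "текст первой главы", "2 Метод": "текст второй главы"},
#   )
#   # scores maps each chapter title to its rounded cosine similarity,
#   # e.g. {"1 Обзор": 0.12, "2 Метод": 0.41}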