-
Notifications
You must be signed in to change notification settings - Fork 2
750 was were check #755
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: dev
Are you sure you want to change the base?
750 was were check #755
Changes from all commits
1e3f293
b663c40
342697b
20f1842
9087e34
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,21 @@ | ||
| from ..base_check import BasePresCriterion, answer | ||
| from app.nlp.is_passive_was_were_sentence import CritreriaType, generate_output_text, get_was_were_sentences | ||
|
|
||
class PresWasWereCheck(BasePresCriterion):
    """Presentation criterion flagging sentences that open with a removable "был*" passive.

    The check fails when more than ``threshold`` such sentences are found.
    When some are found but the threshold is not exceeded, the check still
    gets the full score, and the detected sentences are reported as feedback.
    """

    label = 'Проверка на пассивные конструкции, начинающиеся с Был/Была/Было/Были, которые можно убрать без потери смысла'
    description = ''
    id = 'pres_was_were_check'

    def __init__(self, file_info, threshold=3):
        """Store the tolerated number of detected sentences.

        Args:
            file_info: Passed through unchanged to ``BasePresCriterion``.
            threshold: Maximum number of detected sentences that still passes.
        """
        super().__init__(file_info)
        self.threshold = threshold

    def check(self):
        """Run the check and return the ``answer(...)`` result.

        Returns:
            ``answer(1, 'Пройдена!')`` when nothing is detected;
            ``answer(1, <feedback>)`` when the count is non-zero but within
            ``self.threshold``; ``answer(0, <feedback>)`` otherwise.
        """
        detected_sentences, total_sentences = get_was_were_sentences(self.file, CritreriaType.PRESENTATION)
        if total_sentences == 0:
            return answer(1, 'Пройдена!')

        # Always show where the constructions are, even when the amount is
        # tolerable: the author gets feedback without losing the score.
        result_str = generate_output_text(detected_sentences, CritreriaType.PRESENTATION, self.format_page_link)
        result_score = 0 if total_sentences > self.threshold else 1
        return answer(result_score, result_str)
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,23 @@ | ||
| from ..base_check import BaseReportCriterion, answer | ||
| from app.nlp.is_passive_was_were_sentence import CritreriaType, generate_output_text, get_was_were_sentences | ||
|
|
||
class ReportWasWereCheck(BaseReportCriterion):
    """Report criterion flagging sentences that open with a removable "был*" passive.

    The check fails when more than ``threshold`` such sentences are found.
    When some are found but the threshold is not exceeded, the check still
    gets the full score, and the detected sentences are reported as feedback.
    """

    label = 'Проверка на пассивные конструкции, начинающиеся с Был/Была/Было/Были, которые можно убрать без потери смысла'
    description = ''
    id = 'report_was_were_check'

    def __init__(self, file_info, threshold=3):
        """Store the tolerated number of detected sentences.

        Args:
            file_info: Passed through unchanged to ``BaseReportCriterion``.
            threshold: Maximum number of detected sentences that still passes.
        """
        super().__init__(file_info)
        self.threshold = threshold

    def check(self):
        """Run the check and return the ``answer(...)`` result.

        Returns:
            A failing answer when the report has fewer than 4 pages;
            ``answer(1, 'Пройдена!')`` when nothing is detected;
            ``answer(1, <feedback>)`` when the count is non-zero but within
            ``self.threshold``; ``answer(0, <feedback>)`` otherwise.
        """
        if self.file.page_counter() < 4:
            return answer(False, 'В отчёте недостаточно страниц. Нечего проверять.')

        detected, total_sentences = get_was_were_sentences(self.file, CritreriaType.REPORT)
        if total_sentences == 0:
            return answer(1, 'Пройдена!')

        # Always show where the constructions are, even when the amount is
        # tolerable: the author gets feedback without losing the score.
        result_str = generate_output_text(detected, CritreriaType.REPORT, self.format_page_link)
        result_score = 0 if total_sentences > self.threshold else 1
        return answer(result_score, result_str)
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,113 @@ | ||
| import re | ||
| import pymorphy2 | ||
| import string | ||
| from enum import Enum | ||
|
|
||
| morph = pymorphy2.MorphAnalyzer() | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Добавьте в данном файле в виде комментария примеры хороших и плохих предложений, которые начинаются с Был*
Comment on lines
+2
to
+6
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Подтяните изменения из dev и замените pymorphy2 на pymorphy3 |
||
|
|
||
|
|
||
class CritreriaType(Enum):
    """Kind of document being checked; selects how its text is read and labelled."""

    # Report: text is read page by page from the file's PDF representation.
    REPORT = 'report'
    # Presentation: text is read slide by slide.
    PRESENTATION = 'pres'
|
|
||
|
|
||
def criteria_type_to_str(type: CritreriaType):
    """Return the user-facing noun for one unit of the given document type.

    Reports are labelled by page and presentations by slide; any other value
    falls back to a generic "element" label.
    """
    captions = (
        (CritreriaType.REPORT, "Страница"),
        (CritreriaType.PRESENTATION, "Слайд"),
    )
    for member, caption in captions:
        if type == member:
            return caption
    return "Элемент"
|
|
||
def get_content_by_file(file, type: CritreriaType):
    """Return an iterable of ``(index, text)`` pairs for the given file.

    For a report the pairs are the items of ``file.pdf_file.get_text_on_page()``;
    for a presentation they are the enumerated (0-based) results of
    ``file.get_text_from_slides()``. Any other type yields ``None``.
    """
    if type == CritreriaType.REPORT:
        text_by_page = file.pdf_file.get_text_on_page()
        return text_by_page.items()
    if type == CritreriaType.PRESENTATION:
        slide_texts = file.get_text_from_slides()
        return enumerate(slide_texts)
    return None
|
|
||
def clean_word(word):
    """Return ``word`` with punctuation removed, keeping the ASCII hyphen.

    Besides the ASCII set from ``string.punctuation``, every character whose
    Unicode category is punctuation (``P*``) is dropped. Russian text is full
    of such characters (« », „ “, …, —), and leaving them attached to a word
    prevents it from being recognised by the morphological analyser.

    Args:
        word: A single token, possibly with punctuation attached.

    Returns:
        The token without punctuation; may be an empty string.
    """
    import unicodedata  # stdlib; local import keeps this block self-contained

    ascii_punct = set(string.punctuation) - {'-'}
    return ''.join(
        ch for ch in word
        # '-' is itself category Pd, so it must be exempted explicitly to
        # keep hyphenated words such as "что-то" intact.
        if ch == '-'
        or (ch not in ascii_punct and not unicodedata.category(ch).startswith('P'))
    )
|
|
||
|
|
||
def is_passive_was_were_sentece(sentence):
    """Tell whether a sentence opens with a removable passive "был*" construction.

    A sentence matches when its first word parses as a past-tense verb form
    of "быть" and its second word carries both the ``PRTS`` and ``pssv``
    grammemes. Only the first parse variant of each word is considered.
    Sentences with fewer than two words never match.

    Bad sentences (the "был*" word can be dropped):
        - Был проведен анализ данных
        - Была выполнена работа по исследованию
        - Было принято решение о внедрении
        - Были получены следующие результаты
        - Была создана база данных

    Good sentences (the "был*" word cannot be dropped):
        - Было бы здорово получить новые данные
        - Был сильный скачок напряжения
        - Были времена, когда это казалось невозможным
        - Был студентом университета три года назад
        - Была программистом до выхода на пенсию
    """
    head = re.split(r'\s+', sentence.strip(), maxsplit=2)
    if len(head) < 2:
        return False

    opener = clean_word(head[0])
    follower = clean_word(head[1])

    opener_parse = morph.parse(opener)[0]
    opens_with_past_byt = (
        opener_parse.normal_form == 'быть'
        and 'past' in opener_parse.tag
        and opener_parse.tag.POS == 'VERB'
    )
    if not opens_with_past_byt:
        return False

    follower_tag = morph.parse(follower)[0].tag
    return 'PRTS' in follower_tag and 'pssv' in follower_tag
|
|
||
|
|
||
def generate_output_text(detected_senteces, type: CritreriaType, format_page_link_fn=None):
    """Build the HTML feedback listing detected sentences grouped by page/slide.

    Args:
        detected_senteces: Mapping of page/slide index to a list of messages.
        type: Document type; selects the unit label and the index offset.
        format_page_link_fn: Optional callable that receives
            ``[display_index]`` and returns the text shown instead of the
            plain ``№<index>`` marker.

    Returns:
        An HTML string: a header followed by one section per mapping entry.
    """
    # Presentation indexes are shifted by one for display; every other type
    # is shown as-is. Defaulting to 0 also covers unknown types, which
    # previously left the offset unbound and crashed on the first entry.
    offset_index = 1 if type == CritreriaType.PRESENTATION else 0
    output_type = criteria_type_to_str(type)

    # NOTE(review): messages are inserted into the HTML unescaped; confirm
    # that the sentence text is escaped somewhere before it is rendered.
    parts = ['Обнаружены конструкции (Был/Была/Было/Были), которые можно удалить без потери смысла:<br><br>']
    for index, messages in detected_senteces.items():
        display_index = index + offset_index
        if format_page_link_fn:
            marker = format_page_link_fn([display_index])
        else:
            marker = f'№{display_index}'
        parts.append(f'<b>{output_type} {marker}:</b> <br>' + '<br>'.join(messages) + '<br><br>')
    return ''.join(parts)
|
|
||
|
|
||
def get_was_were_sentences(file, type: CritreriaType):
    """Collect sentences that open with a removable passive "был*" construction.

    Each page/slide text is split into lines, and each non-blank line into
    sentences on runs of ``.``, ``!``, ``?`` or ``…``. Every sentence accepted
    by ``is_passive_was_were_sentece`` is recorded.

    Args:
        file: Document object handed to ``get_content_by_file``.
        type: Document type; also selects how line numbers are reported.

    Returns:
        A tuple ``(detected, total_sentences)`` where ``detected`` maps a
        page/slide index to a list of ``'Строка N: <sentence>'`` messages
        (sentences longer than 50 characters are truncated with ``...``),
        and ``total_sentences`` is the number of detected sentences.
    """
    sentence_break = re.compile(r'[.!?…]+\s*')
    detected = {}
    total_sentences = 0
    for page_index, page_text in get_content_by_file(file, type):
        non_empty_line_counter = 0
        for line_index, line in enumerate(page_text.split('\n')):
            line = line.strip()
            if not line:
                continue

            non_empty_line_counter += 1
            for sentence in sentence_break.split(line):
                sentence = sentence.strip()
                if not sentence or not is_passive_was_were_sentece(sentence):
                    continue

                total_sentences += 1
                truncated_sentence = sentence[:50] + '...' if len(sentence) > 50 else sentence
                # Presentations are numbered by non-blank lines only, while
                # reports use the 1-based position among all lines of the page.
                if type == CritreriaType.PRESENTATION:
                    err_str = f'Строка {non_empty_line_counter}: {truncated_sentence}'
                elif type == CritreriaType.REPORT:
                    err_str = f'Строка {line_index+1}: {truncated_sentence}'
                detected.setdefault(page_index, []).append(err_str)

    return detected, total_sentences
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Добавьте в случае ненулевого, но не критичного total_sentences фидбек с указанием проблемных мест (но полным баллом)