-
Notifications
You must be signed in to change notification settings - Fork 2
tesseract_integration #656
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from 10 commits
d6b163c
88f199c
5ecde02
52d1afe
e783ed9
5cc96ec
c15f5ab
f645a68
40cfc2d
b7acfcd
456e238
89ee03b
c59c475
3f25405
7906f70
7c195c8
40f51be
5fa3014
24eb092
fc8e0c1
d05230a
57bee01
3b18e36
050163a
3f3ef52
5796e5f
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -145,3 +145,24 @@ def none_to_false(x): | |
| is_ended = none_to_true(self.is_ended) # None for old checks => True, True->True, False->False | ||
| is_failed = none_to_false(self.is_failed) # None for old checks => False, True->True, False->False | ||
| return {'is_ended': is_ended, 'is_failed': is_failed} | ||
|
|
||
class Image(PackableWithId):
    """Stored image extracted from a checked document.

    Besides whatever ``PackableWithId`` keeps, an instance holds the id of
    the check it is bound to, the caption, the raw image data, the image
    size and the text recognized on the image together with the id of the
    recognition task.
    """

    def __init__(self, dictionary=None):
        """Build the image from a plain dictionary.

        Args:
            dictionary: mapping with the stored fields; ``None`` or an empty
                mapping leaves every field at its default.
        """
        super().__init__(dictionary)
        fields = dictionary if dictionary else {}

        # Id of the check this image is bound to.
        # NOTE(review): a reviewer suggested also storing the document id.
        # Today it most likely equals the check id, but later one document
        # may have several checks, and images only need to be saved once
        # per document.
        self.check_id = fields.get('check_id')

        # Caption of the image.
        self.caption = fields.get('caption', '')

        # Image file in bindata format.
        # NOTE(review): a reviewer suggested adding a checksum in the future
        # so that the same picture submitted many times is not stored and
        # processed repeatedly.
        self.image_data = fields.get('image_data')

        # Image size in centimeters.
        self.image_size = fields.get('image_size')

        # Text recognized on the image and the id of the task producing it.
        self.text = fields.get('text')
        self.tesseract_task_id = fields.get('tesseract_task_id')

    def pack(self):
        """Return the parent's package extended with the image fields."""
        package = super().pack()
        package.update({
            'check_id': str(self.check_id),
            'caption': self.caption,
            'image_data': self.image_data,
            'image_size': self.image_size,
            'text': self.text,
            'tesseract_task_id': self.tesseract_task_id,
        })
        return package
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,77 @@ | ||
| import re | ||
| from ..base_check import BaseReportCriterion, answer | ||
| import time | ||
| from celery.result import AsyncResult | ||
|
|
||
class ImageTextCheck(BaseReportCriterion):
    """Criterion that validates the text recognized on the report's images.

    For every image the recognized text is obtained (from the image itself
    or from the pending recognition task) and two metrics are checked:

    * text density: non-whitespace characters per unit of image area;
    * share of characters from ``symbols_set`` in the recognized text.

    Images that exceed ``max_text_density`` or ``max_symbols_percentage``
    are reported in the feedback.
    """

    label = "Проверка текста, считанного с изображений"
    description = ''
    id = 'image_text_check'

    # Task states after which a successful result can no longer arrive.
    _FAILED_STATES = ('FAILURE', 'REVOKED')

    # TODO: tune symbols_set, max_symbols_percentage and max_text_density.
    def __init__(self, file_info, symbols_set=('%', '1'), max_symbols_percentage=0, max_text_density=0, max_wait_time=30):
        """Store the thresholds and take the images from the parsed file.

        Args:
            file_info: passed through to ``BaseReportCriterion``.
            symbols_set: characters counted as suspicious; any container
                supporting ``in`` works. The default is an immutable tuple
                so it cannot be shared and mutated between instances.
            max_symbols_percentage: allowed share (in percent) of characters
                from ``symbols_set`` in the recognized text.
            max_text_density: allowed number of non-whitespace characters
                per unit of image area.
            max_wait_time: seconds to wait for one image's recognition task.
        """
        super().__init__(file_info)
        self.images = self.file.images
        self.symbols_set = symbols_set
        self.max_symbols_percentage = max_symbols_percentage
        self.max_text_density = max_text_density
        self.max_wait_time = max_wait_time

    def check(self):
        """Run the criterion and return the result built by ``answer``.

        Fails when there are no images or when at least one image violates
        a threshold; images without recognized text are skipped.
        """
        if not self.images:
            return answer(False, 'Изображения не найдены!')

        deny_list = []
        for image in self.images:
            # An empty string means recognition already ran and found nothing.
            if image.text == '':
                continue
            recognized_text = self.wait_for_text_recognition(image)
            if not recognized_text:
                continue
            deny_list.extend(self._collect_problems(image, recognized_text))

        if deny_list:
            return answer(False, f'Проблемы с текстом на изображениях! <br>{"".join(deny_list)}')
        return answer(True, 'Текст на изображениях корректен!')

    def _collect_problems(self, image, recognized_text):
        """Return feedback fragments for one image with non-empty text."""
        # NOTE(review): image.caption is interpolated into HTML feedback
        # unescaped - confirm whether the caller escapes it.
        problems = []

        # The size may be missing; the density check is skipped then
        # instead of failing on tuple unpacking.
        size = image.image_size
        if size:
            width, height = size
            text_density = self.calculate_text_density(recognized_text, width, height)
            if text_density > self.max_text_density:
                problems.append(
                    f"Изображение с подписью '{image.caption}' имеет слишком высокую плотность текста: "
                    f"{text_density:.4f} (максимум {self.max_text_density}). Это может означать, что текст нечитаем.<br>"
                )

        symbols_count = self.count_symbols_in_text(recognized_text, self.symbols_set)
        # recognized_text is non-empty here, so the division is safe.
        symbols_percentage = (symbols_count / len(recognized_text)) * 100
        if symbols_percentage > self.max_symbols_percentage:
            problems.append(
                f"На изображении с подписью '{image.caption}' содержится слишком много неверно распознанных символов: "
                f"{symbols_percentage:.2f}% (максимум {self.max_symbols_percentage}%). Это может означать, что размер шрифта слишком маленький или текст нечитаем.<br>"
            )
        return problems

    def count_symbols_in_text(self, text, symbols_set):
        """Count the characters of ``text`` that belong to ``symbols_set``."""
        return sum(1 for char in text if char in symbols_set)

    def calculate_text_density(self, text, width, height):
        """Return non-whitespace characters of ``text`` per unit of area.

        Returns 0 when the area ``width * height`` is zero.
        """
        text_without_spaces = ''.join(text.split())
        image_area = width * height
        if image_area == 0:
            return 0
        return len(text_without_spaces) / image_area

    def wait_for_text_recognition(self, image):
        """Return the stripped recognized text of ``image`` or ``None``.

        Text already stored on the image is reused. Otherwise the
        recognition task is polled once per second for at most
        ``max_wait_time`` seconds; on success the whitespace-normalized text
        is saved on the image and through ``add_image_text``.

        ``None`` is returned when the image has no task id, the task failed
        or was revoked, the result is not a string, or the wait timed out.
        """
        # Function-scope import kept from the original code.
        from app.db.db_methods import add_image_text

        if isinstance(image.text, str) and image.text:
            return image.text.strip()

        task_id = image.tesseract_task_id
        if not task_id:
            return None

        task_result = AsyncResult(task_id)
        # A monotonic clock is immune to system time adjustments.
        deadline = time.monotonic() + self.max_wait_time
        while time.monotonic() < deadline:
            state = task_result.state
            if state == 'SUCCESS':
                raw_text = task_result.result
                if not isinstance(raw_text, str):
                    return None
                recognized_text = re.sub(r'\s+', ' ', raw_text)
                image.text = recognized_text
                add_image_text(task_id, recognized_text)
                return recognized_text.strip()
            if state in self._FAILED_STATES:
                # The task will never succeed; do not wait for the timeout.
                return None
            time.sleep(1)

        return None
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Мы планируем уйти от PackableWithId в сторону «нормальной» mongo document model (с указанием типов полей и прочего), поэтому предлагаю новые модели делать на её основе (поддержав нужные операции)