Skip to content
Open
Show file tree
Hide file tree
Changes from 10 commits
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
d6b163c
v1
Sep 27, 2024
88f199c
v1.1
Sep 29, 2024
5ecde02
v2: edit cases
Sep 30, 2024
52d1afe
prototype: images readability check
Dariiiii Feb 6, 2025
e783ed9
fix image_quality_check
Dariiiii Mar 6, 2025
5cc96ec
v1 image_quality_check
Dariiiii Mar 6, 2025
c15f5ab
tesseract prototype
Dariiiii Mar 20, 2025
f645a68
TODO: Implement Tesseract-based text check
Dariiiii Mar 20, 2025
40cfc2d
tesseract check v1
Dariiiii Apr 2, 2025
b7acfcd
add TASK_SOFT_TIME_LIMIT
Dariiiii Apr 2, 2025
456e238
Merge branch 'master' into image_check
HadronCollider Apr 10, 2025
89ee03b
first fix
Dariiiii Apr 14, 2025
c59c475
trial version
Dariiiii Apr 16, 2025
3f25405
correction of tesseract
Dariiiii Apr 16, 2025
7906f70
Merge branch 'master' into tesseract-integration
Dariiiii Apr 17, 2025
7c195c8
fix update_tesseract_criteria_result
Dariiiii Apr 17, 2025
40f51be
update 469_extend_data_storage_model
Dariiiii Apr 17, 2025
5fa3014
Merge branch 'image_check' into tesseract-integration
HadronCollider Apr 22, 2025
24eb092
update docker base tag
HadronCollider Apr 22, 2025
fc8e0c1
Merge remote-tracking branch 'origin/master' into tesseract-integration
HadronCollider Apr 22, 2025
d05230a
Merge branch 'master' into tesseract-integration
HadronCollider Apr 22, 2025
57bee01
correction of comments
Dariiiii Apr 24, 2025
3b18e36
remove the typo
Dariiiii Apr 24, 2025
050163a
fix bug
Dariiiii May 10, 2025
3f3ef52
Merge branch 'master' into tesseract-integration
HadronCollider Nov 10, 2025
5796e5f
update tesseract_worker volume
HadronCollider Nov 10, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 6 additions & 1 deletion Dockerfile_base
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,12 @@ ENV TZ=Europe/Moscow

RUN ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone

RUN apt update && apt install -y libreoffice-writer libreoffice-impress default-jre
RUN apt-get update && apt-get install -y \
libreoffice-writer \
libreoffice-impress \
default-jre \
tesseract-ocr \
tesseract-ocr-rus

ADD requirements.txt .
RUN python3 -m pip install -r requirements.txt --no-cache-dir
46 changes: 45 additions & 1 deletion app/db/db_methods.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from pymongo import MongoClient
from utils import convert_to

from .db_types import User, Presentation, Check, Consumers, Logs
from .db_types import User, Presentation, Check, Consumers, Logs, Image

client = MongoClient("mongodb://mongodb:27017")
db = client['pres-parser-db']
Expand All @@ -21,11 +21,55 @@
logs_collection = db.create_collection(
'logs', capped=True, size=5242880) if not db['logs'] else db['logs']
celery_check_collection = db['celery_check'] # collection for mapping celery_task to check
images_collection = db['images'] # коллекция для хранения изображений


def get_client():
    """Return the module-level ``MongoClient`` instance."""
    return client

def get_images(check_id):
    """Return every stored image that belongs to the given check.

    Args:
        check_id: Identifier of the check; it is converted to ``str`` for the
            query, matching how ``Image.pack`` stores the field.

    Returns:
        A list of ``Image`` objects; empty when nothing matches.
    """
    # find() always hands back a cursor (never None), so the result is
    # simply whatever the cursor yields - possibly nothing at all.
    cursor = images_collection.find({'check_id': str(check_id)})
    return [Image(document) for document in cursor]

def save_image_to_db(check_id, image_data, caption, image_size, text=None, tesseract_task_id=None):
    """Store one image document and return the id assigned on insert.

    Args:
        check_id: Identifier of the check the image belongs to.
        image_data: Binary content of the image.
        caption: Caption text associated with the image.
        image_size: Size of the image as passed by the caller.
        text: Recognized text, if already known.
        tesseract_task_id: Id of the recognition task, if already known.

    Returns:
        The ``inserted_id`` reported by the insert operation.
    """
    fields = {
        'check_id': check_id,
        'image_data': image_data,
        'caption': caption,
        'image_size': image_size,
        'text': text,
        'tesseract_task_id': tesseract_task_id,
    }
    document = Image(fields).pack()
    return images_collection.insert_one(document).inserted_id

def add_image_text(tesseract_task_id, new_text):
    """Set the ``text`` field of the image linked to a recognition task.

    Returns:
        ``True`` when a document with this ``tesseract_task_id`` was matched.
    """
    selector = {'tesseract_task_id': tesseract_task_id}
    change = {'$set': {'text': new_text}}
    outcome = images_collection.update_one(selector, change)
    return outcome.matched_count > 0

def add_tesseract_task_id(image_id, tesseract_task_id):
    """Attach a recognition task id to the image document with this ``_id``.

    Returns:
        ``True`` when a document with this ``_id`` was matched.
    """
    selector = {'_id': image_id}
    change = {'$set': {'tesseract_task_id': tesseract_task_id}}
    outcome = images_collection.update_one(selector, change)
    return outcome.matched_count > 0

def get_tesseract_task_id(image_id):
    """Look up the recognition task id stored for an image.

    Returns:
        The stored ``tesseract_task_id`` value, or ``None`` when no document
        with this ``_id`` exists (or the field is absent).
    """
    document = images_collection.find_one({'_id': image_id})
    return document.get('tesseract_task_id') if document else None


# Returns user if user was created and None if already exists
def add_user(username, password_hash='', is_LTI=False):
Expand Down
21 changes: 21 additions & 0 deletions app/db/db_types.py
Original file line number Diff line number Diff line change
Expand Up @@ -145,3 +145,24 @@ def none_to_false(x):
is_ended = none_to_true(self.is_ended) # None for old checks => True, True->True, False->False
is_failed = none_to_false(self.is_failed) # None for old checks => False, True->True, False->False
return {'is_ended': is_ended, 'is_failed': is_failed}

class Image(PackableWithId):
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Мы планируем уйти от PackableWithId в сторону "нормальной" mongo document model (с указанием типов полей и прочего), поэтому предлагаю новые модели делать с помощью них (поддержав нужные операции)

def __init__(self, dictionary=None):
super().__init__(dictionary)
dictionary = dictionary or {}
self.check_id = dictionary.get('check_id') # Привязка к check_id
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Возможно тут стоит сохранять и id документа - 99% уверенности, что у него сейчас ID одинаковый с проверкой, но в будущем возможны изменения (и тогда документ будет, например, один, а проверок с ним несколько), сохранить изображения хватит один раз именно для документа

self.caption = dictionary.get('caption', '') # Подпись к изображению
self.image_data = dictionary.get('image_data') # Файл изображения в формате bindata
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Полезный момент на будущее - добавить checksum на случай дубликатов (чтобы одна одинаковая фотка в 100 отправок / отчетах нам не занимала лишнее место и ресурсы на обработку)

self.image_size = dictionary.get('image_size') # Размер изображения в сантимерах
self.text = dictionary.get('text')
self.tesseract_task_id = dictionary.get('tesseract_task_id')

def pack(self):
    """Serialize the image into a dict on top of the parent's package.

    ``check_id`` is stored as a string; the other fields are stored as-is.
    """
    package = super().pack()
    package.update({
        'check_id': str(self.check_id),
        'caption': self.caption,
        'image_data': self.image_data,
        'image_size': self.image_size,
        'text': self.text,
        'tesseract_task_id': self.tesseract_task_id,
    })
    return package
1 change: 1 addition & 0 deletions app/main/check_packs/pack_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@
["theme_in_report_check"],
['key_words_report_check'],
["empty_task_page_check"],
["image_text_check"],
]

DEFAULT_TYPE = 'pres'
Expand Down
1 change: 1 addition & 0 deletions app/main/checks/report_checks/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,3 +26,4 @@
from .template_name import ReportTemplateNameCheck
from .key_words_check import KeyWordsReportCheck
from .empty_task_page_check import EmptyTaskPageCheck
from .image_text_check import ImageTextCheck
77 changes: 77 additions & 0 deletions app/main/checks/report_checks/image_text_check.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
import re
from ..base_check import BaseReportCriterion, answer
import time
from celery.result import AsyncResult

class ImageTextCheck(BaseReportCriterion):
label = "Проверка текста, считанного с изображений"
description = ''
id = 'image_text_check'
# Подобрать значения для symbols_set, max_symbols_percentage, max_text_density
def __init__(self, file_info, symbols_set=None, max_symbols_percentage=0, max_text_density=0, max_wait_time=30):
    """Configure the image-text criterion.

    Args:
        file_info: Passed through to the base criterion constructor.
        symbols_set: Characters counted as "suspicious" in recognized text.
            ``None`` selects the default ``['%', '1']``.
        max_symbols_percentage: Upper bound (in percent) for the share of
            ``symbols_set`` characters in the recognized text.
        max_text_density: Upper bound for characters per unit of image area.
        max_wait_time: Seconds to wait for a recognition task to finish.
    """
    super().__init__(file_info)
    self.images = self.file.images
    # The default list is created per instance: a list literal in the
    # signature would be one shared object for every check instance.
    self.symbols_set = ['%', '1'] if symbols_set is None else symbols_set
    self.max_symbols_percentage = max_symbols_percentage
    self.max_text_density = max_text_density
    self.max_wait_time = max_wait_time

def check(self):
    """Evaluate the text recognized on every image of the document.

    For each image with recognized text two metrics are compared against
    the configured limits: text density (characters per unit of image area)
    and the percentage of characters from ``self.symbols_set``.

    Returns:
        ``answer(False, ...)`` when there are no images or at least one
        image exceeds a limit, otherwise ``answer(True, ...)``.
    """
    if not self.images:
        return answer(False, 'Изображения не найдены!')

    deny_list = []
    for image in self.images:
        if image.text == '':
            continue
        recognized_text = self.wait_for_text_recognition(image)
        if not recognized_text:
            continue

        # The size may be missing (None, or a pair holding None) when the
        # document did not provide the image extent; density cannot be
        # computed then, so only that metric is skipped.
        width, height = image.image_size or (None, None)
        if width is not None and height is not None:
            text_density = self.calculate_text_density(recognized_text, width, height)
            if text_density > self.max_text_density:
                deny_list.append(
                    f"Изображение с подписью '{image.caption}' имеет слишком высокую плотность текста: "
                    f"{text_density:.4f} (максимум {self.max_text_density}). Это может означать, что текст нечитаем.<br>"
                )

        symbols_count = self.count_symbols_in_text(recognized_text, self.symbols_set)
        # recognized_text is non-empty here, so the division is safe.
        symbols_percentage = (symbols_count / len(recognized_text)) * 100
        if symbols_percentage > self.max_symbols_percentage:
            deny_list.append(
                f"На изображении с подписью '{image.caption}' содержится слишком много неверно распознанных символов: "
                f"{symbols_percentage:.2f}% (максимум {self.max_symbols_percentage}%). Это может означать, что размер шрифта слишком маленький или текст нечитаем.<br>"
            )

    if deny_list:
        return answer(False, f'Проблемы с текстом на изображениях! <br>{"".join(deny_list)}')
    return answer(True, 'Текст на изображениях корректен!')

def count_symbols_in_text(self, text, symbols_set):
    """Count the characters of ``text`` that are members of ``symbols_set``."""
    matches = [ch for ch in text if ch in symbols_set]
    return len(matches)

def calculate_text_density(self, text, width, height):
    """Return non-whitespace characters of ``text`` per unit of image area.

    Returns 0 when the area (``width * height``) is zero.
    """
    visible_chars = sum(len(token) for token in text.split())
    area = width * height
    return visible_chars / area if area != 0 else 0

def wait_for_text_recognition(self, image):
from app.db.db_methods import add_image_text
start_time = time.time()
task_id = image.tesseract_task_id
if not task_id:
return None

while time.time() - start_time < self.max_wait_time:
task_result = AsyncResult(task_id)
if task_result.state == 'SUCCESS':
recognized_text = task_result.result
recognized_text = re.sub(r'\s+', ' ', recognized_text)
image.text = recognized_text
add_image_text(task_id, recognized_text)
return recognized_text.strip()
time.sleep(1)
Copy link
Collaborator

@HadronCollider HadronCollider Apr 10, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

кажется, подобный подход ожидания не самый лучший (мы по факту блокируем всю проверку / очередь) - можно ли сделать "заглушку" (по типу фидбека "проверяется" в этой проверке), а в celery-задаче с тессерактом после распознавания и обработки - обновлять данные в БД проверки? но стоит добавить какую-то проверку, не слишком ли долго тессеракт обрабатывает картинку или вообще её не выполнил (чтобы обновить фидбек/результат критерия в соответствии со сложившейся ситуацией)


return None
39 changes: 36 additions & 3 deletions app/main/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,18 +8,39 @@
from main.reports.md_uploader import MdUploader
from utils import convert_to

logger = logging.getLogger('root_logger')
from os.path import basename
from app.db.db_methods import add_check
from app.db.db_types import Check

logger = logging.getLogger('root_logger')

def parse(filepath, pdf_filepath):
from app.db.db_methods import files_info_collection

tmp_filepath = filepath.lower()
try:
if tmp_filepath.endswith(('.odp', '.ppt', '.pptx')):
new_filepath = filepath
if tmp_filepath.endswith(('.odp', '.ppt')):
logger.info(f"Презентация {filepath} старого формата. Временно преобразована в pptx для обработки.")
new_filepath = convert_to(filepath, target_format='pptx')
file_object = PresentationPPTX(new_filepath)

presentation = PresentationPPTX(new_filepath)

check = Check({
'filename': basename(new_filepath),
})

file_id = 0
file = files_info_collection.find_one({'name': basename(new_filepath)})
if file:
file_id = file['_id']

check_id = add_check(file_id, check)
presentation.extract_images_with_captions(check_id)
file_object = presentation


elif tmp_filepath.endswith(('.doc', '.odt', '.docx', )):
new_filepath = filepath
if tmp_filepath.endswith(('.doc', '.odt')):
Expand All @@ -28,7 +49,19 @@ def parse(filepath, pdf_filepath):

docx = DocxUploader()
docx.upload(new_filepath, pdf_filepath)

check = Check({
'filename': basename(new_filepath),
})

file_id = 0
file = files_info_collection.find_one({'name': basename(new_filepath)})
if file:
file_id = file['_id']

check_id = add_check(file_id, check)
docx.parse()
docx.extract_images_with_captions(check_id)
file_object = docx

elif tmp_filepath.endswith('.md' ):
Expand All @@ -54,4 +87,4 @@ def save_to_temp_file(file):
temp_file.write(file.read())
temp_file.close()
file.seek(0)
return temp_file.name
return temp_file.name
39 changes: 39 additions & 0 deletions app/main/presentations/pptx/presentation_pptx.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,7 @@
from io import BytesIO

from pptx import Presentation
from pptx.enum.shapes import MSO_SHAPE_TYPE

from .slide_pptx import SlidePPTX
from ..presentation_basic import PresentationBasic
Expand All @@ -17,3 +20,39 @@ def add_slides(self):

def __str__(self):
return super().__str__()

def extract_images_with_captions(self, check_id):
    """Store one picture per slide together with a caption guess.

    For every slide, the last picture shape found is paired with the first
    non-empty text shape on the same slide (assumed to be the caption) and
    saved via ``save_image_to_db``. Slides without a picture, or without
    any non-empty text shape, store nothing.

    Args:
        check_id: Identifier of the check the images are attached to.
    """
    from app.db.db_methods import save_image_to_db

    emu_per_cm = 360000  # EMU per centimetre, for the shape size conversion

    for slide in self.slides:
        shapes = slide.slide.shapes  # slide.slide is the underlying slide object

        # NOTE(review): as before, a later picture on the slide replaces an
        # earlier one, so at most one image per slide is stored - confirm
        # whether every picture should be saved.
        picture = None
        for shape in shapes:
            if shape.shape_type == MSO_SHAPE_TYPE.PICTURE:
                picture = shape
        if picture is None:
            continue

        # First non-empty text on the slide is taken as the caption.
        caption_text = None
        for shape in shapes:
            if not shape.has_text_frame:
                continue
            text = shape.text.strip()
            if text:
                caption_text = text
                break
        if caption_text is None:
            continue

        # save_image_to_db requires the image size; it was not passed
        # before, which made this call fail with a TypeError.
        width, height = picture.width, picture.height
        if width is None or height is None:
            image_size = (None, None)
        else:
            image_size = (width / emu_per_cm, height / emu_per_cm)

        save_image_to_db(check_id, picture.image.blob, caption_text, image_size)
1 change: 1 addition & 0 deletions app/main/reports/document_uploader.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ def __init__(self):
self.literature_page = 0
self.first_lines = []
self.page_count = 0
self.images = []

@abstractmethod
def upload(self):
Expand Down
47 changes: 47 additions & 0 deletions app/main/reports/docx_uploader/docx_uploader.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
from ..document_uploader import DocumentUploader



class DocxUploader(DocumentUploader):
def __init__(self):
super().__init__()
Expand Down Expand Up @@ -242,6 +243,52 @@ def show_chapters(self, work_type):
chapters_str += "&nbsp;&nbsp;&nbsp;&nbsp;" + header["text"] + "<br>"
return chapters_str

def extract_images_with_captions(self, check_id):
    """Extract embedded images, store them and schedule text recognition.

    Does nothing when ``self.images`` is already populated. Otherwise every
    embedded image found in a paragraph run is saved with the caption taken
    from the next non-empty paragraph that holds no graphic, a recognition
    task is queued for it, and ``self.images`` is reloaded from the
    database.

    Args:
        check_id: Identifier of the check the images are attached to.
    """
    from app.db.db_methods import save_image_to_db, get_images, add_tesseract_task_id
    from app.tesseract_tasks import tesseract_recognize

    if self.images:
        return

    emu_to_cm = 360000  # EMU per centimetre
    blip_ns = {'a': 'http://schemas.openxmlformats.org/drawingml/2006/main'}
    extent_ns = {'wp': 'http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing'}
    embed_attr = '{http://schemas.openxmlformats.org/officeDocument/2006/relationships}embed'

    # Read the paragraph list once instead of re-reading
    # self.file.paragraphs on every step of the caption search.
    paragraphs = self.file.paragraphs

    for i, paragraph in enumerate(paragraphs):
        # Collect every image of the paragraph so that a later image does
        # not overwrite an earlier one before it is saved.
        found = []
        for run in paragraph.runs:
            if "graphic" not in run._element.xml:
                continue
            extent = run._element.find('.//wp:extent', namespaces=extent_ns)
            width_cm = height_cm = None
            if extent is not None:
                width_cm = int(extent.get('cx')) / emu_to_cm
                height_cm = int(extent.get('cy')) / emu_to_cm
            for blip in run._element.findall('.//a:blip', namespaces=blip_ns):
                embed_id = blip.get(embed_attr)
                if embed_id:
                    image_part = self.file.part.related_parts[embed_id]
                    found.append((image_part.blob, (width_cm, height_cm)))
        if not found:
            continue

        # Caption: the first following paragraph with text and no graphic.
        caption = "picture without caption"
        for candidate in paragraphs[i + 1:]:
            candidate_text = candidate.text.strip()
            if candidate_text and not any("graphic" in r._element.xml for r in candidate.runs):
                caption = candidate_text
                break

        for image_data, image_size in found:
            image_id = save_image_to_db(check_id, image_data, caption, image_size)
            task = tesseract_recognize.delay(str(image_id), image_data)
            add_tesseract_task_id(image_id, task.id)

    self.images = get_images(check_id)



def main(args):
file = args.file
Expand Down
Loading
Loading