Skip to content
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
d6b163c
v1
Sep 27, 2024
88f199c
v1.1
Sep 29, 2024
5ecde02
v2: edit cases
Sep 30, 2024
52d1afe
prototype: images readability check
Dariiiii Feb 6, 2025
e783ed9
fix image_quality_check
Dariiiii Mar 6, 2025
5cc96ec
v1 image_quality_check
Dariiiii Mar 6, 2025
c15f5ab
tesseract prototype
Dariiiii Mar 20, 2025
f645a68
TODO: Implement Tesseract-based text check
Dariiiii Mar 20, 2025
40cfc2d
tesseract check v1
Dariiiii Apr 2, 2025
b7acfcd
add TASK_SOFT_TIME_LIMIT
Dariiiii Apr 2, 2025
456e238
Merge branch 'master' into image_check
HadronCollider Apr 10, 2025
89ee03b
first fix
Dariiiii Apr 14, 2025
c59c475
trial version
Dariiiii Apr 16, 2025
3f25405
correction of tesseract
Dariiiii Apr 16, 2025
7906f70
Merge branch 'master' into tesseract-integration
Dariiiii Apr 17, 2025
7c195c8
fix update_tesseract_criteria_result
Dariiiii Apr 17, 2025
40f51be
update 469_extend_data_storage_model
Dariiiii Apr 17, 2025
5fa3014
Merge branch 'image_check' into tesseract-integration
HadronCollider Apr 22, 2025
24eb092
update docker base tag
HadronCollider Apr 22, 2025
fc8e0c1
Merge remote-tracking branch 'origin/master' into tesseract-integration
HadronCollider Apr 22, 2025
d05230a
Merge branch 'master' into tesseract-integration
HadronCollider Apr 22, 2025
57bee01
correction of comments
Dariiiii Apr 24, 2025
3b18e36
remove the typo
Dariiiii Apr 24, 2025
050163a
fix bug
Dariiiii May 10, 2025
3f3ef52
Merge branch 'master' into tesseract-integration
HadronCollider Nov 10, 2025
5796e5f
update tesseract_worker volume
HadronCollider Nov 10, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 6 additions & 1 deletion Dockerfile_base
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,12 @@ ENV TZ=Europe/Moscow

RUN ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone

RUN apt update && apt install -y libreoffice-writer libreoffice-impress default-jre
RUN apt-get update && apt-get install -y \
libreoffice-writer \
libreoffice-impress \
default-jre \
tesseract-ocr \
tesseract-ocr-rus

ADD requirements.txt .
RUN python3 -m pip install -r requirements.txt --no-cache-dir
18 changes: 14 additions & 4 deletions app/db/db_methods.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,16 +37,26 @@ def get_images(check_id):
else:
return None

def save_image_to_db(check_id, image_data, caption, image_size, text=''):
    """Store an extracted image with its metadata and return the new record id.

    Args:
        check_id: identifier of the check the image belongs to.
        image_data: raw image file contents.
        caption: caption text associated with the image.
        image_size: image size value stored as-is alongside the image.
        text: text recognized on the image; empty by default so existing
            callers that do not pass it keep working.

    Returns:
        The ``inserted_id`` reported by the collection for the new document.
    """
    image = Image({
        'check_id': check_id,
        'image_data': image_data,
        'caption': caption,
        'image_size': image_size,
        'text': text,
    })
    # Insert exactly once and hand the generated id back to the caller so it
    # can later attach recognized text to this record.
    result = images_collection.insert_one(image.pack())
    return result.inserted_id

def update_image_text(image_id, new_text):
    """Overwrite the ``text`` field of the image document with the given id.

    Args:
        image_id: value matched against the document ``_id``.
        new_text: text to store in the ``text`` field.

    Returns:
        True when at least one document matched the id; False when nothing
        matched or when the update raised any exception.
    """
    # NOTE(review): the filter uses image_id exactly as passed; if a caller
    # supplies a string form of the id while `_id` is stored as another type,
    # nothing will match - confirm against callers.
    selector = {'_id': image_id}
    change = {'$set': {'text': new_text}}
    try:
        outcome = images_collection.update_one(selector, change)
        # Kept inside the try: reading the result may itself raise.
        return outcome.matched_count > 0
    except Exception:
        return False

# Returns user if user was created and None if already exists
def add_user(username, password_hash='', is_LTI=False):
Expand Down
2 changes: 2 additions & 0 deletions app/db/db_types.py
Original file line number Diff line number Diff line change
Expand Up @@ -154,11 +154,13 @@ def __init__(self, dictionary=None):
self.caption = dictionary.get('caption', '') # Подпись к изображению
self.image_data = dictionary.get('image_data') # Файл изображения в формате bindata
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Полезный момент на будущее - добавить checksum на случай дубликатов (чтобы одна одинаковая фотка в 100 отправок / отчетах нам не занимала лишнее место и ресурсы на обработку)

self.image_size = dictionary.get('image_size') # Размер изображения в сантиметрах
self.text = dictionary.get('text')

def pack(self):
    """Return the parent's packed representation extended with image fields.

    Adds ``check_id`` (converted to a string), ``caption``, ``image_data``,
    ``image_size`` and ``text`` to whatever the parent ``pack()`` returns.
    """
    package = super().pack()
    # presumably the parent returns a plain dict - verify against the base class
    package.update({
        'check_id': str(self.check_id),
        'caption': self.caption,
        'image_data': self.image_data,
        'image_size': self.image_size,
        'text': self.text,
    })
    return package
1 change: 0 additions & 1 deletion app/main/check_packs/pack_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,6 @@
["theme_in_report_check"],
['key_words_report_check'],
["empty_task_page_check"],
['image_quality_check'],
]

DEFAULT_TYPE = 'pres'
Expand Down
3 changes: 1 addition & 2 deletions app/main/checks/report_checks/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,5 +25,4 @@
from .max_abstract_size_check import ReportMaxSizeOfAbstractCheck
from .template_name import ReportTemplateNameCheck
from .key_words_check import KeyWordsReportCheck
from .empty_task_page_check import EmptyTaskPageCheck
from .image_quality_check import ImageQualityCheck
from .empty_task_page_check import EmptyTaskPageCheck
54 changes: 0 additions & 54 deletions app/main/checks/report_checks/image_quality_check.py
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@Dariiiii точно ли тут должен удаляться этот файл (т.е. фактически результат #647)?

This file was deleted.

10 changes: 7 additions & 3 deletions app/main/reports/docx_uploader/docx_uploader.py
Original file line number Diff line number Diff line change
Expand Up @@ -244,6 +244,7 @@ def show_chapters(self, work_type):

def extract_images_with_captions(self, check_id):
from app.db.db_methods import save_image_to_db, get_images
from app.tesseract_tasks import tesseract_recognize

emu_to_cm = 360000
image_found = False
Expand Down Expand Up @@ -293,13 +294,16 @@ def extract_images_with_captions(self, check_id):
# Если параграф не содержит изображения и текст не пуст, то это подпись
if not contains_image and next_paragraph_text:
# Сохраняем изображение и его подпись
save_image_to_db(check_id, image_data, next_paragraph_text, (width_cm, height_cm))
image_id = save_image_to_db(check_id, image_data, next_paragraph_text, (width_cm, height_cm))
tesseract_recognize.delay(image_id, image_data)
break
else:
save_image_to_db(check_id, image_data, "picture without caption", (width_cm, height_cm))
image_id = save_image_to_db(check_id, image_data, "picture without caption", (width_cm, height_cm))
tesseract_recognize.delay(image_id, image_data)
break
else:
save_image_to_db(check_id, image_data, "picture without caption", (width_cm, height_cm))
image_id = save_image_to_db(check_id, image_data, "picture without caption", (width_cm, height_cm))
tesseract_recognize.delay(image_id, image_data)

image_found = False # Сброс флага, чтобы искать следующее изображение
image_data = None # Очистка данных изображения
Expand Down
49 changes: 49 additions & 0 deletions app/tesseract_tasks.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
import os
from celery import Celery
from celery.signals import worker_ready
import pytesseract
import cv2
import numpy as np
from db import db_methods
from root_logger import get_root_logger

# Value passed as `countdown` when a failed recognition task is retried.
TASK_RETRY_COUNTDOWN = 60
logger = get_root_logger('tesseract_tasks')

# Celery application for the OCR tasks; broker and result backend come from
# the environment, falling back to the `redis` host on port 6379.
celery = Celery(__name__)
celery.conf.update(
    broker_url=os.environ.get("CELERY_BROKER_URL", "redis://redis:6379"),
    result_backend=os.environ.get("CELERY_RESULT_BACKEND", "redis://redis:6379"),
    timezone='Europe/Moscow',
)

# Keyword arguments forwarded to pytesseract.image_to_string: Russian language
# data and the `--psm 6` command-line option.
TESSERACT_CONFIG = dict(lang='rus', config='--psm 6')

@worker_ready.connect
def at_start(sender, **kwargs):
    """Log a readiness message when the ``worker_ready`` signal fires.

    Args:
        sender: object that sent the signal (unused).
        **kwargs: extra signal arguments (unused).
    """
    ready_message = "Tesseract worker is ready!"
    logger.info(ready_message)


@celery.task(name="tesseract_recognize", queue='tesseract-queue', bind=True)
def tesseract_recognize(self, image_id, image_data):
    """Recognize text on an image with Tesseract and store it for the image.

    Args:
        self: the bound task instance (the task is declared with ``bind=True``).
        image_id: identifier of the image record whose text is updated.
        image_data: encoded image file contents as a bytes-like object.

    Returns:
        The recognized text on success, or a string starting with
        "Ошибка: " when the image cannot be decoded or when all retries
        have been used up.

    Raises:
        celery.exceptions.Retry: raised through ``self.retry`` to reschedule
            the task after a failure while retries remain.
    """
    # NOTE(review): image_data travels through the broker as a task argument;
    # confirm the configured serializer can carry raw bytes.
    try:
        img_cv = None
        if image_data:
            image_array = np.frombuffer(image_data, dtype=np.uint8)
            img_cv = cv2.imdecode(image_array, cv2.IMREAD_COLOR)
        if img_cv is None:
            # Empty or undecodable data will never decode on a later attempt,
            # so report the failure immediately instead of retrying.
            decode_error = ValueError("Не удалось декодировать изображение из двоичных данных")
            logger.error(f"Ошибка при распознавании текста: {decode_error}")
            return f"Ошибка: {decode_error}"

        text = pytesseract.image_to_string(img_cv, **TESSERACT_CONFIG)
        success = db_methods.update_image_text(image_id, text)
        if not success:
            logger.error(f"Не удалось записать текст для image_id: {image_id}")
            raise Exception("Ошибка при обновлении текста изображения в базе данных")
        logger.info(f"Текст успешно распознан и записан для image_id: {image_id}")
        return text

    except Exception as e:
        logger.error(f"Ошибка при распознавании текста: {e}", exc_info=True)
        # max_retries may be None (no retry limit); `>=` also covers the case
        # where the counter is already past the limit.
        retries_exhausted = (
            self.max_retries is not None
            and self.request.retries >= self.max_retries
        )
        if retries_exhausted:
            logger.error(f"Достигнуто максимальное количество попыток для image_id: {image_id}")
            return f"Ошибка: {e}"
        # Raising makes the control flow explicit; passing `exc` keeps the
        # original error attached to the retry.
        raise self.retry(exc=e, countdown=TASK_RETRY_COUNTDOWN)
17 changes: 17 additions & 0 deletions docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,23 @@ services:
volumes:
- ../slides_checker_mongo_data:/data/db
cpuset: ${CONTAINER_CPU:-0-1}

tesseract_worker:
image: document_insight_system_image
restart: always
command: celery --app=app.tasks.celery worker -n tesseract@worker -Q tesseract-queue --loglevel=info
environment:
- CELERY_BROKER_URL=${REDIS_URL}
- CELERY_RESULT_BACKEND=${REDIS_URL}
depends_on:
- redis
- mongodb
volumes:
- presentation_files:/usr/src/project/files/
- "/etc/timezone:/etc/timezone:ro"
- "/etc/localtime:/etc/localtime:ro"
cpuset: ${CONTAINER_CPU:-0-1}
mem_limit: ${WORKER_MEMORY:-1G}

volumes:
flower_data:
Expand Down
3 changes: 2 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -35,4 +35,5 @@ filetype==1.2.0
language-tool-python==2.7.1
markdown==3.4.4
md2pdf==1.0.1
opencv-python==4.5.5.64
opencv-python==4.5.5.64
pytesseract==0.3.10