moevm · Dariiiii · Sep 27, 2024 · Sep 29, 2024 · Sep 30, 2024 · Feb 6, 2025
diff --git a/Dockerfile_base b/Dockerfile_base
@@ -8,7 +8,12 @@ ENV TZ=Europe/Moscow
 
 RUN ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone
 
-RUN apt update && apt install -y libreoffice-writer libreoffice-impress default-jre
+# System packages: LibreOffice Writer/Impress with a Java runtime, plus Tesseract OCR and its Russian language data
+RUN apt-get update &&  apt-get install -y \
+    libreoffice-writer \
+    libreoffice-impress \
+    default-jre \
+    tesseract-ocr \
+    tesseract-ocr-rus
 
 ADD requirements.txt .
 RUN python3 -m pip install -r requirements.txt --no-cache-dir
diff --git a/app/db/db_methods.py b/app/db/db_methods.py
@@ -7,7 +7,7 @@
 from pymongo import MongoClient
 from utils import convert_to
 
-from .db_types import User, Presentation, Check, Consumers, Logs
+from .db_types import User, Presentation, Check, Consumers, Logs, Image
 
 client = MongoClient("mongodb://mongodb:27017")
 db = client['pres-parser-db']
@@ -21,11 +21,55 @@
 logs_collection = db.create_collection(
     'logs', capped=True, size=5242880) if not db['logs'] else db['logs']
 celery_check_collection = db['celery_check']  # collection for mapping celery_task to check
+images_collection = db['images']  # collection for storing images
 
 
 def get_client():
     return client
 
+def get_images(check_id):
+    """Return every stored image attached to the given check.
+
+    `check_id` is converted to `str` before querying, which matches the string
+    form written by `Image.pack()`.
+
+    Returns a list of `Image` objects; the list is empty when nothing matches.
+    """
+    # `find()` always returns a cursor (never None), so no None branch is needed.
+    cursor = images_collection.find({'check_id': str(check_id)})
+    return [Image(document) for document in cursor]
+
+def save_image_to_db(check_id, image_data, caption, image_size, text=None, tesseract_task_id=None):
+    """Insert one image document and return the id assigned by the database.
+
+    The arguments are wrapped in an `Image` and stored in its packed form.
+    """
+    fields = dict(
+        check_id=check_id,
+        image_data=image_data,
+        caption=caption,
+        image_size=image_size,
+        text=text,
+        tesseract_task_id=tesseract_task_id,
+    )
+    packed = Image(fields).pack()
+    return images_collection.insert_one(packed).inserted_id
+
+def add_image_text(tesseract_task_id, new_text):
+    """Set the `text` field of the image linked to the given Tesseract task.
+
+    Returns True when a document with that `tesseract_task_id` was matched.
+    """
+    selector = {'tesseract_task_id': tesseract_task_id}
+    change = {'$set': {'text': new_text}}
+    outcome = images_collection.update_one(selector, change)
+    return outcome.matched_count > 0
+
+def add_tesseract_task_id(image_id, tesseract_task_id):
+    """Attach a Tesseract task id to the image document with the given `_id`.
+
+    Returns True when a document with that `_id` was matched.
+    """
+    selector = {'_id': image_id}
+    change = {'$set': {'tesseract_task_id': tesseract_task_id}}
+    outcome = images_collection.update_one(selector, change)
+    return outcome.matched_count > 0
+
+def get_tesseract_task_id(image_id):
+    """Return the Tesseract task id stored on an image, or None.
+
+    None is returned both when no image has the given `_id` and when the
+    document has no `tesseract_task_id` field.
+    """
+    document = images_collection.find_one({'_id': image_id})
+    if not document:
+        return None
+    return document.get('tesseract_task_id')
+
 
 # Returns user if user was created and None if already exists
 def add_user(username, password_hash='', is_LTI=False):

diff --git a/app/db/db_types.py b/app/db/db_types.py
@@ -145,3 +145,24 @@ def none_to_false(x):
         is_ended = none_to_true(self.is_ended)  # None for old checks => True, True->True, False->False
         is_failed = none_to_false(self.is_failed)  # None for old checks => False, True->True, False->False
         return {'is_ended': is_ended, 'is_failed': is_failed}
+
+class Image(PackableWithId):
+    """An image extracted from a checked document, with its caption and OCR text."""
+
+    def __init__(self, dictionary=None):
+        super().__init__(dictionary)
+        source = dictionary if dictionary else {}
+        self.check_id = source.get('check_id')  # link to the owning check
+        self.caption = source.get('caption', '')  # caption of the image
+        self.image_data = source.get('image_data')  # image file as binary data
+        self.image_size = source.get('image_size')  # image size in centimetres
+        self.text = source.get('text')
+        self.tesseract_task_id = source.get('tesseract_task_id')
+
+    def pack(self):
+        """Return the storable form of the image; `check_id` is stored as a string."""
+        package = super().pack()
+        package['check_id'] = str(self.check_id)
+        # The remaining fields are copied unchanged, in this order.
+        for field in ('caption', 'image_data', 'image_size', 'text', 'tesseract_task_id'):
+            package[field] = getattr(self, field)
+        return package
diff --git a/app/main/check_packs/pack_config.py b/app/main/check_packs/pack_config.py
@@ -46,6 +46,7 @@
     ["theme_in_report_check"],
     ['key_words_report_check'],
     ["empty_task_page_check"],
+    ["image_text_check"],
 ]
 
 DEFAULT_TYPE = 'pres'

diff --git a/app/main/checks/report_checks/__init__.py b/app/main/checks/report_checks/__init__.py
@@ -26,3 +26,4 @@
 from .template_name import ReportTemplateNameCheck
 from .key_words_check import KeyWordsReportCheck
 from .empty_task_page_check import EmptyTaskPageCheck
+from .image_text_check import ImageTextCheck
diff --git a/app/main/checks/report_checks/image_text_check.py b/app/main/checks/report_checks/image_text_check.py
@@ -0,0 +1,77 @@
+import re
+from ..base_check import BaseReportCriterion, answer
+import time
+from celery.result import AsyncResult
+
+class ImageTextCheck(BaseReportCriterion):
+    """Report criterion that inspects the text recognized (OCR) on images.
+
+    For every image with recognized text two metrics are computed:
+    the text density (non-space characters per unit of image area) and the
+    percentage of characters that belong to `symbols_set`. An image is
+    reported when a metric exceeds its configured maximum.
+    """
+    label = "Проверка текста, считанного с изображений"
+    description = ''
+    id = 'image_text_check'
+
+    # Celery task states after which no result will ever arrive.
+    _FAILED_STATES = ('FAILURE', 'REVOKED')
+
+    # TODO: tune the values of symbols_set, max_symbols_percentage, max_text_density
+    def __init__(self, file_info, symbols_set=None, max_symbols_percentage=0, max_text_density=0, max_wait_time=30):
+        """
+        :param file_info: passed through to the base criterion.
+        :param symbols_set: characters counted as suspicious; defaults to ['%', '1'].
+        :param max_symbols_percentage: allowed percentage of suspicious characters.
+        :param max_text_density: allowed number of characters per unit of image area.
+        :param max_wait_time: seconds to wait for each recognition task.
+        """
+        super().__init__(file_info)
+        self.images = self.file.images
+        # A None sentinel avoids sharing one mutable default list between instances.
+        self.symbols_set = ['%', '1'] if symbols_set is None else symbols_set
+        self.max_symbols_percentage = max_symbols_percentage
+        self.max_text_density = max_text_density
+        self.max_wait_time = max_wait_time
+
+    def check(self):
+        """Run the criterion over all images and return the resulting answer."""
+        if not self.images:
+            return answer(False, 'Изображения не найдены!')
+
+        deny_list = []
+        for image in self.images:
+            # An empty string means recognition already ran and found no text.
+            if image.text == '':
+                continue
+            recognized_text = self.wait_for_text_recognition(image)
+            if not recognized_text:
+                continue
+            # The size may be missing entirely or hold None components.
+            width, height = image.image_size or (None, None)
+            text_density = self.calculate_text_density(recognized_text, width, height)
+            if text_density > self.max_text_density:
+                deny_list.append(
+                    f"Изображение с подписью '{image.caption}' имеет слишком высокую плотность текста: "
+                    f"{text_density:.4f} (максимум {self.max_text_density}). Это может означать, что текст нечитаем.<br>"
+                )
+            symbols_count = self.count_symbols_in_text(recognized_text, self.symbols_set)
+            # recognized_text is non-empty here, so the division is safe.
+            symbols_percentage = (symbols_count / len(recognized_text)) * 100
+            if symbols_percentage > self.max_symbols_percentage:
+                deny_list.append(
+                    f"На изображении с подписью '{image.caption}' содержится слишком много неверно распознанных символов: "
+                    f"{symbols_percentage:.2f}% (максимум {self.max_symbols_percentage}%). Это может означать, что размер шрифта слишком маленький или текст нечитаем.<br>"
+                )
+
+        if deny_list:
+            return answer(False, f'Проблемы с текстом на изображениях! <br>{"".join(deny_list)}')
+        return answer(True, 'Текст на изображениях корректен!')
+
+    def count_symbols_in_text(self, text, symbols_set):
+        """Return how many characters of `text` are contained in `symbols_set`."""
+        return sum(1 for char in text if char in symbols_set)
+
+    def calculate_text_density(self, text, width, height):
+        """Return non-whitespace characters per unit of image area.
+
+        Returns 0 when the width or height is unknown (None) or zero.
+        """
+        if not width or not height:
+            return 0
+        text_without_spaces = ''.join(text.split())
+        return len(text_without_spaces) / (width * height)
+
+    def wait_for_text_recognition(self, image):
+        """Wait for the image's recognition task and return the recognized text.
+
+        Polls the task once per second for at most `max_wait_time` seconds. On
+        success the whitespace-normalized text is written to `image.text` and to
+        the database, and its stripped form is returned. Returns None when the
+        image has no task, the task failed or was revoked, or the wait timed out.
+        """
+        from app.db.db_methods import add_image_text
+
+        task_id = image.tesseract_task_id
+        if not task_id:
+            return None
+
+        # A monotonic clock keeps the timeout immune to system clock adjustments.
+        deadline = time.monotonic() + self.max_wait_time
+        while time.monotonic() < deadline:
+            task_result = AsyncResult(task_id)
+            state = task_result.state
+            if state == 'SUCCESS':
+                recognized_text = re.sub(r'\s+', ' ', task_result.result or '')
+                image.text = recognized_text
+                add_image_text(task_id, recognized_text)
+                return recognized_text.strip()
+            if state in self._FAILED_STATES:
+                # The task will never succeed; do not burn the rest of the timeout.
+                return None
+            time.sleep(1)
+
+        return None
diff --git a/app/main/parser.py b/app/main/parser.py
@@ -8,18 +8,39 @@
 from main.reports.md_uploader import MdUploader
 from utils import convert_to
 
-logger = logging.getLogger('root_logger')
+from os.path import basename
+from app.db.db_methods import add_check
+from app.db.db_types import Check
 
+logger = logging.getLogger('root_logger')
 
 def parse(filepath, pdf_filepath):
+    from app.db.db_methods import files_info_collection
+
     tmp_filepath = filepath.lower()
     try:
         if tmp_filepath.endswith(('.odp', '.ppt', '.pptx')):
             new_filepath = filepath
             if tmp_filepath.endswith(('.odp', '.ppt')):
                 logger.info(f"Презентация {filepath} старого формата. Временно преобразована в pptx для обработки.")
                 new_filepath = convert_to(filepath, target_format='pptx')
-            file_object = PresentationPPTX(new_filepath)
+
+            presentation = PresentationPPTX(new_filepath)
+
+            check = Check({
+                'filename': basename(new_filepath),
+            })
+
+            file_id = 0
+            file = files_info_collection.find_one({'name': basename(new_filepath)})
+            if file:
+                file_id = file['_id']
+
+            check_id = add_check(file_id, check)
+            presentation.extract_images_with_captions(check_id)
+            file_object = presentation
+
+
         elif tmp_filepath.endswith(('.doc', '.odt', '.docx', )):
             new_filepath = filepath
             if tmp_filepath.endswith(('.doc', '.odt')):
@@ -28,7 +49,19 @@ def parse(filepath, pdf_filepath):
 
             docx = DocxUploader()
             docx.upload(new_filepath, pdf_filepath)
+
+            check = Check({
+                'filename': basename(new_filepath),
+            })
+
+            file_id = 0
+            file = files_info_collection.find_one({'name': basename(new_filepath)})
+            if file:
+                file_id = file['_id']
+
+            check_id = add_check(file_id, check)
             docx.parse()
+            docx.extract_images_with_captions(check_id)
             file_object = docx
 
         elif tmp_filepath.endswith('.md' ):
@@ -54,4 +87,4 @@ def save_to_temp_file(file):
     temp_file.write(file.read())
     temp_file.close()
     file.seek(0)
-    return temp_file.name
+    return temp_file.name
diff --git a/app/main/presentations/pptx/presentation_pptx.py b/app/main/presentations/pptx/presentation_pptx.py
@@ -1,4 +1,7 @@
+from io import BytesIO
+
 from pptx import Presentation
+from pptx.enum.shapes import MSO_SHAPE_TYPE
 
 from .slide_pptx import SlidePPTX
 from ..presentation_basic import PresentationBasic
@@ -17,3 +20,39 @@ def add_slides(self):
 
     def __str__(self):
         return super().__str__()
+
+    def extract_images_with_captions(self, check_id):
+        """Save the pictures of the presentation to the database with captions.
+
+        On each slide the first non-empty text frame is taken as the caption of
+        every picture on that slide; pictures on slides without any text are
+        not saved. The picture size is stored as (width, height) in centimetres.
+
+        :param check_id: id of the check the saved images are attached to.
+        """
+        from app.db.db_methods import save_image_to_db
+
+        emu_per_cm = 360000  # shape sizes are converted from EMU to centimetres
+
+        for slide in self.slides:
+            shapes = slide.slide.shapes  # slide.slide gives access to the current slide
+
+            # The caption does not depend on the picture, so find it once per slide.
+            caption_text = None
+            for candidate in shapes:
+                if not candidate.has_text_frame:
+                    continue
+                text = candidate.text.strip()
+                if text:
+                    caption_text = text
+                    break
+            if caption_text is None:
+                continue
+
+            for shape in shapes:
+                if shape.shape_type != MSO_SHAPE_TYPE.PICTURE:
+                    continue
+                width, height = shape.width, shape.height
+                image_size = (
+                    width / emu_per_cm if width is not None else None,
+                    height / emu_per_cm if height is not None else None,
+                )
+                # save_image_to_db requires image_size as its fourth argument.
+                save_image_to_db(check_id, shape.image.blob, caption_text, image_size)
diff --git a/app/main/reports/document_uploader.py b/app/main/reports/document_uploader.py
@@ -12,6 +12,7 @@ def __init__(self):
         self.literature_page = 0
         self.first_lines = []
         self.page_count = 0
+        self.images = []
 
     @abstractmethod
     def upload(self):

diff --git a/app/main/reports/docx_uploader/docx_uploader.py b/app/main/reports/docx_uploader/docx_uploader.py
@@ -12,6 +12,7 @@
 from ..document_uploader import DocumentUploader
 
 
+
 class DocxUploader(DocumentUploader):
     def __init__(self):
         super().__init__()
@@ -242,6 +243,52 @@ def show_chapters(self, work_type):
                 chapters_str += "&nbsp;&nbsp;&nbsp;&nbsp;" + header["text"] + "<br>"
         return chapters_str
 
+    def extract_images_with_captions(self, check_id):
+        """Extract embedded images, store them and schedule text recognition.
+
+        Does nothing when `self.images` is already filled. Otherwise every
+        embedded picture found in the paragraph runs is saved together with its
+        caption and its (width, height) in centimetres, a Tesseract recognition
+        task is started for it, and `self.images` is reloaded from the database.
+
+        :param check_id: id of the check the saved images are attached to.
+        """
+        from app.db.db_methods import save_image_to_db, get_images, add_tesseract_task_id
+        from app.tesseract_tasks import tesseract_recognize
+
+        if self.images:
+            return
+
+        namespaces = {
+            'a': 'http://schemas.openxmlformats.org/drawingml/2006/main',
+            'wp': 'http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing',
+        }
+        embed_attr = '{http://schemas.openxmlformats.org/officeDocument/2006/relationships}embed'
+        # Read the paragraph list once instead of on every caption-search step.
+        paragraphs = self.file.paragraphs
+
+        for index, paragraph in enumerate(paragraphs):
+            for run in paragraph.runs:
+                if "graphic" not in run._element.xml:
+                    continue
+                # Collect every picture of the run so none is lost when a run
+                # holds more than one.
+                pictures = []
+                for blip in run._element.findall('.//a:blip', namespaces=namespaces):
+                    embed_id = blip.get(embed_attr)
+                    if not embed_id:
+                        continue
+                    image_data = self.file.part.related_parts[embed_id].blob
+                    pictures.append((image_data, self._blip_size_cm(blip, namespaces)))
+                if not pictures:
+                    continue
+
+                caption = self._find_caption_after(paragraphs, index + 1)
+                for image_data, image_size in pictures:
+                    image_id = save_image_to_db(check_id, image_data, caption, image_size)
+                    # NOTE(review): image_data is raw bytes; confirm the Celery
+                    # serializer in use accepts it.
+                    task = tesseract_recognize.delay(str(image_id), image_data)
+                    add_tesseract_task_id(image_id, task.id)
+
+        self.images = get_images(check_id)
+
+    @staticmethod
+    def _blip_size_cm(blip, namespaces):
+        """Return (width, height) in cm from the nearest enclosing extent, or (None, None)."""
+        emu_per_cm = 360000
+        for ancestor in blip.iterancestors():
+            extent = ancestor.find('wp:extent', namespaces=namespaces)
+            if extent is None:
+                continue
+            cx, cy = extent.get('cx'), extent.get('cy')
+            if cx is None or cy is None:
+                break
+            return int(cx) / emu_per_cm, int(cy) / emu_per_cm
+        return None, None
+
+    @staticmethod
+    def _find_caption_after(paragraphs, start_index):
+        """Return the text of the first non-empty paragraph without a picture, from start_index on."""
+        for paragraph in paragraphs[start_index:]:
+            text = paragraph.text.strip()
+            if text and not any("graphic" in r._element.xml for r in paragraph.runs):
+                return text
+        return "picture without caption"
+
+
 
 def main(args):
     file = args.file