Skip to content

Commit b0ebe90

Browse files
committed
Adds functionality to record markdown_used on stored documents
1 parent f391387 commit b0ebe90

6 files changed

Lines changed: 13 additions & 5 deletions

File tree

ocd_backend/enrichers/text_enricher/__init__.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
from ocd_backend.enrichers import BaseEnricher
1212
from ocd_backend.exceptions import SkipEnrichment
1313
from ocd_backend.log import get_source_logger
14-
from ocd_backend.settings import RESOLVER_BASE_URL, RETRY_MAX_RETRIES, OCR_VERSION
14+
from ocd_backend.settings import RESOLVER_BASE_URL, RETRY_MAX_RETRIES, OCR_VERSION, MARKDOWN_VERSION
1515
from ocd_backend.models.postgres_database import PostgresDatabase
1616
from ocd_backend.models.serializers import PostgresSerializer
1717
from ocd_backend.utils.file_parsing import file_parser, make_temp_pdf_fname, md_file_parser, md_file_parser_using_ocr, parse_result_is_empty, rewrite_problematic_pdfs, force_ocr
@@ -146,7 +146,7 @@ def enrich_item(self, item, metadata):
146146
item.md_text = md_file_parser_using_ocr(path, item.original_url)
147147
ocr_used = OCR_VERSION
148148

149-
ori_document = OriDocument(path, item, ocr_used=ocr_used, metadata=metadata)
149+
ori_document = OriDocument(path, item, ocr_used=ocr_used, markdown_used=MARKDOWN_VERSION, metadata=metadata)
150150
try:
151151
ori_document.store()
152152
except sa.exc.IntegrityError as e:

ocd_backend/models/postgres_models.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ class StoredDocument(Base):
2020
content_type = Column(String, nullable=False)
2121
file_size = Column(BigInteger, nullable=False)
2222
ocr_used = Column(String, nullable=True)
23+
markdown_used = Column(String, nullable=True)
2324
created_at = Column(DateTime, nullable=False)
2425
updated_at = Column(DateTime, nullable=False)
2526

ocd_backend/requirements.txt

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,9 +9,9 @@ lxml==5.3.0
99
kombu==5.4.2
1010
# msgpack-python==0.4.2
1111
nose2==0.15.1
12-
pymupdf @ git+https://github.com/openstate/PyMuPDF.git@366458a
12+
pymupdf @ git+https://github.com/openstate/PyMuPDF.git@366458a # When changing this, also change settings.MARKDOWN_VERSION
1313
pdftotext==3.0.0
14-
pymupdf4llm==0.0.24
14+
pymupdf4llm==0.0.24 # When changing this, also change settings.MARKDOWN_VERSION
1515
Pillow==9.3.0
1616
psycopg2==2.9.10
1717
PyLD==2.0.4

ocd_backend/settings.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -286,6 +286,7 @@ def get_ocr_version():
286286
# See also requirements.txt
287287
return f"tesserocr==2.7.1,{tesseract_version}"
288288
OCR_VERSION = get_ocr_version()
289+
# Combines the openstate/PyMuPDF commit and the pymupdf4llm release pinned in requirements.txt; keep them in sync.
MARKDOWN_VERSION = "openstate/PyMuPDF@366458a-pymupdf4llm==0.0.24"
289290

290291
# Allow any settings to be defined in local_settings.py which should be
291292
# ignored in your version control system allowing for settings to be

ocd_backend/utils/ori_document.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@
1616
log = get_source_logger('document_storage')
1717

1818
class OriDocument():
19-
def __init__(self, temp_path, item, metadata, ocr_used = None):
19+
def __init__(self, temp_path, item, metadata, ocr_used = None, markdown_used = None):
2020
self.temp_path = temp_path
2121
self.file_name = item.file_name if hasattr(item, 'file_name') else None
2222
self.md_text = item.md_text
@@ -25,6 +25,7 @@ def __init__(self, temp_path, item, metadata, ocr_used = None):
2525
self.last_changed_at = self.get_last_changed_at(item)
2626
self.source, self.supplier = self.get_source_and_supplier(item)
2727
self.ocr_used = ocr_used
28+
self.markdown_used = markdown_used
2829
self.metadata = metadata
2930

3031
self.metadata['content_type'] = item.content_type
@@ -33,6 +34,7 @@ def __init__(self, temp_path, item, metadata, ocr_used = None):
3334
self.metadata['last_changed_at'] = self.last_changed_at.isoformat() if self.last_changed_at else ''
3435
self.metadata['original_url'] = item.original_url if hasattr(item, 'original_url') else ''
3536
self.metadata['ocr_used'] = ocr_used if ocr_used else ''
37+
self.metadata['markdown_used'] = markdown_used if markdown_used else ''
3638

3739
database = PostgresDatabase(serializer=PostgresSerializer)
3840
self.session = database.Session()
@@ -61,6 +63,7 @@ def exists_and_not_changed(self):
6163
if self.stored_document.last_changed_at != self.last_changed_at or \
6264
self.stored_document.file_size != self.file_size or \
6365
self.stored_document.ocr_used != self.ocr_used or \
66+
(self.stored_document.markdown_used and self.stored_document.markdown_used != self.markdown_used) or \
6467
self.stored_document.file_size != self.file_size:
6568
return False
6669

@@ -95,6 +98,7 @@ def store_in_db(self):
9598
self.stored_document.last_changed_at = self.last_changed_at
9699
self.stored_document.file_size = self.file_size
97100
self.stored_document.ocr_used = self.ocr_used
101+
self.stored_document.markdown_used = self.markdown_used
98102
self.stored_document.updated_at = time_now
99103
else:
100104
content_type = magic.from_file(self.temp_path, mime=True)
@@ -107,6 +111,7 @@ def store_in_db(self):
107111
content_type=content_type,
108112
file_size=self.file_size,
109113
ocr_used=self.ocr_used,
114+
markdown_used=self.markdown_used,
110115
created_at=time_now,
111116
updated_at=time_now
112117
)

ocd_frontend/models/postgres_models.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ class StoredDocument(Base):
2020
content_type = Column(String, nullable=False)
2121
file_size = Column(BigInteger, nullable=False)
2222
ocr_used = Column(String, nullable=True)
23+
markdown_used = Column(String, nullable=True)
2324
created_at = Column(DateTime, nullable=False)
2425
updated_at = Column(DateTime, nullable=False)
2526

0 commit comments

Comments
 (0)