1616log = get_source_logger ('document_storage' )
1717
1818class OriDocument ():
19- def __init__ (self , temp_path , item , metadata , ocr_used = None ):
19+ def __init__ (self , temp_path , item , metadata , ocr_used = None , markdown_used = None ):
2020 self .temp_path = temp_path
2121 self .file_name = item .file_name if hasattr (item , 'file_name' ) else None
2222 self .md_text = item .md_text
@@ -25,6 +25,7 @@ def __init__(self, temp_path, item, metadata, ocr_used = None):
2525 self .last_changed_at = self .get_last_changed_at (item )
2626 self .source , self .supplier = self .get_source_and_supplier (item )
2727 self .ocr_used = ocr_used
28+ self .markdown_used = markdown_used
2829 self .metadata = metadata
2930
3031 self .metadata ['content_type' ] = item .content_type
@@ -33,6 +34,7 @@ def __init__(self, temp_path, item, metadata, ocr_used = None):
3334 self .metadata ['last_changed_at' ] = self .last_changed_at .isoformat () if self .last_changed_at else ''
3435 self .metadata ['original_url' ] = item .original_url if hasattr (item , 'original_url' ) else ''
3536 self .metadata ['ocr_used' ] = ocr_used if ocr_used else ''
37+ self .metadata ['markdown_used' ] = markdown_used if markdown_used else ''
3638
3739 database = PostgresDatabase (serializer = PostgresSerializer )
3840 self .session = database .Session ()
@@ -61,6 +63,7 @@ def exists_and_not_changed(self):
6163 if self .stored_document .last_changed_at != self .last_changed_at or \
6264 self .stored_document .file_size != self .file_size or \
6365 self .stored_document .ocr_used != self .ocr_used or \
66+ (self .stored_document .markdown_used and self .stored_document .markdown_used != self .markdown_used ) or \
6467 self .stored_document .file_size != self .file_size :
6568 return False
6669
@@ -95,6 +98,7 @@ def store_in_db(self):
9598 self .stored_document .last_changed_at = self .last_changed_at
9699 self .stored_document .file_size = self .file_size
97100 self .stored_document .ocr_used = self .ocr_used
101+ self .stored_document .markdown_used = self .markdown_used
98102 self .stored_document .updated_at = time_now
99103 else :
100104 content_type = magic .from_file (self .temp_path , mime = True )
@@ -107,6 +111,7 @@ def store_in_db(self):
107111 content_type = content_type ,
108112 file_size = self .file_size ,
109113 ocr_used = self .ocr_used ,
114+ markdown_used = self .markdown_used ,
110115 created_at = time_now ,
111116 updated_at = time_now
112117 )
0 commit comments