# Licensed under the MIT License.

import asyncio
+import hashlib
import os
import re
import traceback
...
    Depends,
    HTTPException,
    UploadFile,
+    status,
)
+from markitdown import MarkItDown, StreamInfo

from graphrag_app.logger.load_logger import load_pipeline_logger
from graphrag_app.typing.models import (
    BaseResponse,
    StorageNameList,
)
from graphrag_app.utils.common import (
+    check_cache,
+    create_cache,
    delete_cosmos_container_item_if_exist,
    delete_storage_container_if_exist,
    get_blob_container_client,
    get_cosmos_container_store_client,
    sanitize_name,
    subscription_key_check,
+    update_cache,
)
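
Reviewer note: check_cache, create_cache, and update_cache are new helpers in graphrag_app.utils.common whose implementations are not part of this diff. A minimal sketch of the semantics the upload code below appears to rely on (a single cache blob mapping content hashes to filenames) might look like the following; the blob name and JSON layout are assumptions, not the actual implementation:

    import json

    from azure.storage.blob.aio import ContainerClient

    CACHE_BLOB = "upload_file_cache.json"  # hypothetical cache blob name

    async def create_cache(container_client: ContainerClient) -> None:
        # create an empty cache blob if one does not already exist
        blob = container_client.get_blob_client(CACHE_BLOB)
        if not await blob.exists():
            await blob.upload_blob(json.dumps({}), overwrite=False)

    async def check_cache(file_hash: str, container_client: ContainerClient) -> bool:
        # True if a file with this content hash was uploaded before (skip it)
        blob = container_client.get_blob_client(CACHE_BLOB)
        downloader = await blob.download_blob()
        cache = json.loads(await downloader.readall())
        return file_hash in cache

    async def update_cache(
        uploads: list[tuple[str, str]], container_client: ContainerClient
    ) -> None:
        # record (filename, file_hash) pairs from successful uploads
        blob = container_client.get_blob_client(CACHE_BLOB)
        downloader = await blob.download_blob()
        cache = json.loads(await downloader.readall())
        cache.update({file_hash: filename for filename, file_hash in uploads})
        await blob.upload_blob(json.dumps(cache), overwrite=True)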

data_route = APIRouter(
...
    "",
    summary="Get list of data containers.",
    response_model=StorageNameList,
-    responses={200: {"model": StorageNameList}},
+    responses={status.HTTP_200_OK: {"model": StorageNameList}},
)
async def get_all_data_containers():
    """
@@ -67,56 +73,66 @@ async def get_all_data_containers():
    return StorageNameList(storage_name=items)


-async def upload_file_async(
+async def upload_file(
    upload_file: UploadFile, container_client: ContainerClient, overwrite: bool = True
-) -> None:
+):
    """
-    Asynchronously upload a file to the specified blob container.
-    Silently ignore errors that occur when overwrite=False.
+    Convert and upload a file to a specified blob container.
+
+    Each call returns one of the following:
+    * Tuple[str, str] - (filename, file_hash) for a successful upload
+    * Tuple[str, None] - (filename, None) for a failed conversion or upload
+    * None - for a file skipped because its hash is already in the cache
    """
-    blob_client = container_client.get_blob_client(upload_file.filename)
+    filename = upload_file.filename
+    extension = os.path.splitext(filename)[1]
+    converted_filename = filename + ".txt"
+    converted_blob_client = container_client.get_blob_client(converted_filename)
+
    with upload_file.file as file_stream:
        try:
-            await blob_client.upload_blob(file_stream, overwrite=overwrite)
+            file_hash = hashlib.sha256(file_stream.read()).hexdigest()
+            if not await check_cache(file_hash, container_client):
+                # extract text from the file using MarkItDown
+                md = MarkItDown()
+                stream_info = StreamInfo(
+                    extension=extension,
+                )
+                file_stream._file.seek(0)
+                file_stream = file_stream._file
+                result = md.convert_stream(
+                    stream=file_stream,
+                    stream_info=stream_info,
+                )
+
+                # remove illegal unicode characters and upload to blob storage
+                cleaned_result = _clean_output(result.text_content)
+                await converted_blob_client.upload_blob(
+                    cleaned_result, overwrite=overwrite
+                )
+
+                # return (filename, file_hash) to indicate success
+                return (filename, file_hash)
        except Exception:
-            pass
-
+            # on any exception, return (filename, None) to indicate a conversion/upload failure
+            return (upload_file.filename, None)
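
The conversion step above can be exercised on its own. A minimal standalone sketch, assuming the markitdown package is installed; the input filename is illustrative only:

    from markitdown import MarkItDown, StreamInfo

    md = MarkItDown()
    with open("sample.pdf", "rb") as f:  # hypothetical local file
        result = md.convert_stream(
            stream=f,
            stream_info=StreamInfo(extension=".pdf"),
        )
    print(result.text_content[:200])  # first 200 characters of extracted text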

-class Cleaner:
-    def __init__(self, file):
-        self.file = file
-        self.name = file.name
-        self.changes = 0

-    def clean(self, val, replacement=""):
-        # fmt: off
-        _illegal_xml_chars_RE = re.compile(
+def _clean_output(val: str, replacement: str = ""):
+    """Remove unicode characters that are invalid XML characters (not valid for graphml files)."""
+    # fmt: off
+    _illegal_xml_chars_RE = re.compile(
        "[\x00-\x08\x0b\x0c\x0e-\x1F\uD800-\uDFFF\uFFFE\uFFFF]"
    )
-        # fmt: on
-        self.changes += len(_illegal_xml_chars_RE.findall(val))
-        return _illegal_xml_chars_RE.sub(replacement, val)
-
-    def read(self, n):
-        return self.clean(self.file.read(n).decode()).encode(
-            encoding="utf-8", errors="strict"
-        )
-
-    def name(self):
-        return self.file.name
-
-    def __enter__(self):
-        return self
-
-    def __exit__(self, *args):
-        self.file.close()
+    # fmt: on
+    return _illegal_xml_chars_RE.sub(replacement, val)
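
To illustrate the helper on a string containing illegal control characters (example string invented for demonstration):

    >>> _clean_output("graph\x00 node\x0b name")
    'graph node name'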


@data_route.post(
    "",
    summary="Upload data to a data storage container",
    response_model=BaseResponse,
-    responses={200: {"model": BaseResponse}},
+    responses={status.HTTP_201_CREATED: {"model": BaseResponse}},
)
async def upload_files(
    files: List[UploadFile],
@@ -125,36 +141,33 @@ async def upload_files(
    overwrite: bool = True,
):
    """
-    Create a Azure Storage container and upload files to it.
-
-    Args:
-        files (List[UploadFile]): A list of files to be uploaded.
-        storage_name (str): The name of the Azure Blob Storage container to which files will be uploaded.
-        overwrite (bool): Whether to overwrite existing files with the same name. Defaults to True. If False, files that already exist will be skipped.
-
-    Returns:
-        BaseResponse: An instance of the BaseResponse model with a status message indicating the result of the upload.
-
-    Raises:
-        HTTPException: If the container name is invalid or if any error occurs during the upload process.
+    Create an Azure Storage container (if needed) and upload files. Multiple file types are supported, including pdf, powerpoint, word, excel, html, csv, json, xml, etc.
+    The complete set of supported file types can be found in the MarkItDown (https://github.com/microsoft/markitdown) library.
    """
    try:
-        # clean files - remove illegal XML characters
-        files = [UploadFile(Cleaner(f.file), filename=f.filename) for f in files]
-
-        # upload files in batches of 1000 to avoid exceeding Azure Storage API limits
+        # create the initial cache if it doesn't exist
        blob_container_client = await get_blob_container_client(
            sanitized_container_name
        )
-        batch_size = 1000
+        await create_cache(blob_container_client)
+
+        # process file uploads in batches to avoid exceeding Azure Storage API limits
+        processing_errors = []
+        batch_size = 100
        num_batches = ceil(len(files) / batch_size)
        for i in range(num_batches):
            batch_files = files[i * batch_size : (i + 1) * batch_size]
            tasks = [
-                upload_file_async(file, blob_container_client, overwrite)
+                upload_file(file, blob_container_client, overwrite)
                for file in batch_files
            ]
-            await asyncio.gather(*tasks)
+            upload_results = await asyncio.gather(*tasks)
+            successful_uploads = [r for r in upload_results if r and r[1] is not None]
+            # update the file cache with successful uploads
+            await update_cache(successful_uploads, blob_container_client)
+            # collect failed uploads
+            failed_uploads = [r[0] for r in upload_results if r and r[1] is None]
+            processing_errors.extend(failed_uploads)

        # update container-store entry in cosmosDB once upload process is successful
        cosmos_container_store_client = get_cosmos_container_store_client()
@@ -163,17 +176,23 @@ async def upload_files(
            "human_readable_name": container_name,
            "type": "data",
        })
-        return BaseResponse(status="File upload successful.")
+
+        if len(processing_errors) > 0:
+            raise HTTPException(
+                status_code=status.HTTP_422_UNPROCESSABLE_ENTITY,
+                detail=f"Error uploading files: {processing_errors}.",
+            )
+        return BaseResponse(status="Success.")
    except Exception as e:
        logger = load_pipeline_logger()
        logger.error(
            message="Error uploading files.",
            cause=e,
            stack=traceback.format_exc(),
-            details={"files": [f.filename for f in files]},
+            details={"files": processing_errors},
        )
        raise HTTPException(
-            status_code=500,
+            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"Error uploading files to container '{container_name}'.",
        )
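
For reviewers who want to exercise the endpoint manually, a hedged client-side sketch follows; the base URL, route prefix, and the container_name query parameter are assumptions about the deployment, not defined in this diff:

    import requests

    url = "http://localhost:8000/data"  # hypothetical host and route prefix
    files = [
        ("files", ("report.pdf", open("report.pdf", "rb"), "application/pdf")),
        ("files", ("notes.docx", open("notes.docx", "rb"))),
    ]
    resp = requests.post(url, files=files, params={"container_name": "my-dataset"})
    print(resp.status_code, resp.json())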
@@ -182,7 +201,7 @@ async def upload_files(
    "/{container_name}",
    summary="Delete a data storage container",
    response_model=BaseResponse,
-    responses={200: {"model": BaseResponse}},
+    responses={status.HTTP_200_OK: {"model": BaseResponse}},
)
async def delete_files(
    container_name: str, sanitized_container_name: str = Depends(sanitize_name)