@@ -36,6 +36,13 @@ def parse_args():
36
36
help = 'Whether to dump the textrecog dataset to LMDB format, It\' s a '
37
37
'shortcut to force the dataset to be dumped in lmdb format. '
38
38
'Applicable when --task=textrecog' )
39
+ parser .add_argument (
40
+ '--huggingface' ,
41
+ action = 'store_true' ,
42
+ default = False ,
43
+ help = 'Whether to dump the ser/re dataset to huggingface format,'
44
+ 'It\' s a shortcut to force the dataset to be dumped in huggingface '
45
+ 'format. Applicable when --task=ser or re' )
39
46
parser .add_argument (
40
47
'--overwrite-cfg' ,
41
48
action = 'store_true' ,
@@ -124,10 +131,56 @@ def force_lmdb(cfg):
124
131
return cfg
125
132
126
133
134
def force_huggingface(cfg):
    """Force the dataset to be dumped in huggingface format.

    The config is modified in place: the dumper of every configured split
    is switched to ``HuggingfaceDumper``, the generated dataset name gets a
    ``_huggingface`` suffix, and the annotation files referenced by the
    config generator are renamed to carry a ``.huggingface`` extension.

    Args:
        cfg (Config): Config object.

    Returns:
        Config: The same config object, updated in place.

    Raises:
        ValueError: If a configured split preparer has no dumper, in which
            case there is nothing to redirect to the huggingface format.
    """
    for split in ['train', 'val', 'test']:
        preparer_cfg = cfg.get(f'{split}_preparer')
        if not preparer_cfg:
            # Split is not configured for this dataset; nothing to force.
            continue
        if preparer_cfg.get('dumper') is None:
            raise ValueError(
                f'{split} split does not come with a dumper, '
                'so most likely the annotations are MMOCR-ready and do '
                'not need any adaptation, and it '
                'cannot be dumped in huggingface format.')
        preparer_cfg.dumper['type'] = 'HuggingfaceDumper'

    cfg.config_generator['dataset_name'] = f'{cfg.dataset_name}_huggingface'

    for split in ['train_anns', 'val_anns', 'test_anns']:
        if split in cfg.config_generator:
            # It can be None when users want to clear out the default
            # value
            if not cfg.config_generator[split]:
                continue
            ann_list = cfg.config_generator[split]
            for ann_dict in ann_list:
                # Keep the original stem and only swap the extension.
                ann_dict['ann_file'] = (
                    osp.splitext(ann_dict['ann_file'])[0] + '.huggingface')
        else:
            # The key is absent, so fall back to the default annotation
            # names for train/test; the val split defaults to no files.
            if split == 'train_anns':
                ann_list = [dict(ann_file=f'{cfg.task}_train.huggingface')]
            elif split == 'test_anns':
                ann_list = [dict(ann_file=f'{cfg.task}_test.huggingface')]
            else:
                ann_list = []
        cfg.config_generator[split] = ann_list

    return cfg
176
+
177
+
127
178
def main ():
128
179
args = parse_args ()
129
180
if args .lmdb and args .task != 'textrecog' :
130
181
raise ValueError ('--lmdb only works with --task=textrecog' )
182
+ if args .huggingface and args .task not in ['ser' , 're' ]:
183
+ raise ValueError ('--huggingface only works with --task=ser or re' )
131
184
for dataset in args .datasets :
132
185
if not osp .isdir (osp .join (args .dataset_zoo_path , dataset )):
133
186
warnings .warn (f'{ dataset } is not supported yet. Please check '
@@ -145,6 +198,8 @@ def main():
145
198
cfg .dataset_name = dataset
146
199
if args .lmdb :
147
200
cfg = force_lmdb (cfg )
201
+ if args .huggingface :
202
+ cfg = force_huggingface (cfg )
148
203
preparer = DatasetPreparer .from_file (cfg )
149
204
preparer .run (args .splits )
150
205
0 commit comments