Skip to content

Commit d90663b

Browse files
committed
Merge branch 'layoutlm' of https://github.com/KevinNuNu/mmocr into layoutlm
2 parents 1d0c5e3 + 106fcb9 commit d90663b

File tree

7 files changed

+107
-7
lines changed

7 files changed

+107
-7
lines changed

mmocr/datasets/preparers/config_generators/base.py

+8-1
Original file line numberDiff line numberDiff line change
@@ -83,11 +83,18 @@ def _prepare_anns(self, train_anns: Optional[List[Dict]],
8383
assert 'ann_file' in ann_dict
8484
suffix = ann_dict['ann_file'].split('.')[-1]
8585
if suffix == 'json':
86-
dataset_type = 'OCRDataset'
86+
if self.task in ['ser', 're']:
87+
dataset_type = f'{self.task.upper()}Dataset'
88+
else:
89+
dataset_type = 'OCRDataset'
8790
elif suffix == 'lmdb':
8891
assert self.task == 'textrecog', \
8992
'LMDB format only works for textrecog now.'
9093
dataset_type = 'RecogLMDBDataset'
94+
elif suffix == 'huggingface':
95+
assert self.task in ['ser', 're'], \
96+
'Huggingface format only works for ser or re now.'
97+
dataset_type = f'{self.task.upper()}HuggingfaceDataset'
9198
else:
9299
raise NotImplementedError(
93100
'ann file only supports JSON file or LMDB file')

mmocr/datasets/preparers/config_generators/re_config_generator.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -87,8 +87,8 @@ def _gen_dataset_config(self) -> str:
8787
cfg = ''
8888
for key_name, ann_dict in self.anns.items():
8989
cfg += f'\n{key_name} = dict(\n'
90-
cfg += ' type=\'REDataset\',\n'
91-
cfg += ' data_root=' + f'{self.dataset_name}_{self.task}_data_root,\n' # noqa: E501
90+
cfg += f' type=\'{ann_dict["dataset_type"]}\',\n'
91+
cfg += f' data_root={self.dataset_name}_{self.task}_data_root,\n' # noqa: E501
9292
cfg += f' ann_file=\'{ann_dict["ann_file"]}\',\n'
9393
if ann_dict['split'] in ['test', 'val']:
9494
cfg += ' test_mode=True,\n'

mmocr/datasets/preparers/config_generators/ser_config_generator.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -87,8 +87,8 @@ def _gen_dataset_config(self) -> str:
8787
cfg = ''
8888
for key_name, ann_dict in self.anns.items():
8989
cfg += f'\n{key_name} = dict(\n'
90-
cfg += ' type=\'SERDataset\',\n'
91-
cfg += ' data_root=' + f'{self.dataset_name}_{self.task}_data_root,\n' # noqa: E501
90+
cfg += f' type=\'{ann_dict["dataset_type"]}\',\n'
91+
cfg += f' data_root={self.dataset_name}_{self.task}_data_root,\n' # noqa: E501
9292
cfg += f' ann_file=\'{ann_dict["ann_file"]}\',\n'
9393
if ann_dict['split'] in ['test', 'val']:
9494
cfg += ' test_mode=True,\n'

mmocr/datasets/preparers/config_generators/textdet_config_generator.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -86,7 +86,7 @@ def _gen_dataset_config(self) -> str:
8686
for key_name, ann_dict in self.anns.items():
8787
cfg += f'\n{key_name} = dict(\n'
8888
cfg += ' type=\'OCRDataset\',\n'
89-
cfg += ' data_root=' + f'{self.dataset_name}_{self.task}_data_root,\n' # noqa: E501
89+
cfg += f' data_root={self.dataset_name}_{self.task}_data_root,\n' # noqa: E501
9090
cfg += f' ann_file=\'{ann_dict["ann_file"]}\',\n'
9191
if ann_dict['split'] == 'train':
9292
cfg += ' filter_cfg=dict(filter_empty_gt=True, min_size=32),\n' # noqa: E501
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,11 @@
11
# Copyright (c) OpenMMLab. All rights reserved.
22
from .base import BaseDumper
3+
from .huggingface_dumper import HuggingfaceDumper
34
from .json_dumper import JsonDumper
45
from .lmdb_dumper import TextRecogLMDBDumper
56
from .wild_receipt_openset_dumper import WildreceiptOpensetDumper
67

78
__all__ = [
89
'BaseDumper', 'JsonDumper', 'WildreceiptOpensetDumper',
9-
'TextRecogLMDBDumper'
10+
'TextRecogLMDBDumper', 'HuggingfaceDumper'
1011
]
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
# Copyright (c) OpenMMLab. All rights reserved.
2+
import os.path as osp
3+
from collections import defaultdict
4+
from typing import Dict
5+
6+
from datasets import Dataset, Image
7+
8+
from mmocr.registry import DATA_DUMPERS
9+
from .base import BaseDumper
10+
11+
12+
@DATA_DUMPERS.register_module()
class HuggingfaceDumper(BaseDumper):
    """Dumper that stores Semantic Entity Recognition / Relation Extraction
    annotations as a huggingface ``datasets`` dataset on disk."""

    def dump(self, data: Dict) -> None:
        """Convert MMOCR format data to a ``Dataset`` and save it to disk.

        The output location is ``{task}_{split}.huggingface`` under
        ``self.data_root``. Every sample contributes its image path (joined
        with ``self.data_root``) to the ``image`` column and each key of its
        ``instances`` mapping to a column of the same name.

        Args:
            data (Dict): MMOCR format data to be dumped. Its ``data_list``
                entry is iterated; each item is read through the keys
                ``img_path`` and ``instances``.
        """
        samples = data.get('data_list', None)
        out_path = osp.join(self.data_root,
                            f'{self.task}_{self.split}.huggingface')

        # Column-oriented layout: one list per field, one entry per sample.
        columns = defaultdict(list)
        for sample in samples:
            fields = sample['instances']
            columns['image'].append(
                osp.join(self.data_root, sample['img_path']))
            for name, value in fields.items():
                columns[name].append(value)

        # Build the dataset, cast the ``image`` column to the ``Image``
        # feature, then save to disk.
        Dataset.from_dict(columns).cast_column(
            'image', Image()).save_to_disk(out_path)

tools/dataset_converters/prepare_dataset.py

+55
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,13 @@ def parse_args():
3636
help='Whether to dump the textrecog dataset to LMDB format, It\'s a '
3737
'shortcut to force the dataset to be dumped in lmdb format. '
3838
'Applicable when --task=textrecog')
39+
parser.add_argument(
40+
'--huggingface',
41+
action='store_true',
42+
default=False,
43+
help='Whether to dump the ser/re dataset to huggingface format,'
44+
'It\'s a shortcut to force the dataset to be dumped in huggingface '
45+
'format. Applicable when --task=ser or re')
3946
parser.add_argument(
4047
'--overwrite-cfg',
4148
action='store_true',
@@ -124,10 +131,56 @@ def force_lmdb(cfg):
124131
return cfg
125132

126133

134+
def force_huggingface(cfg):
    """Force the dataset to be dumped in huggingface format.

    The config is modified in place:

    - every non-empty ``{train,val,test}_preparer`` gets its dumper type set
      to ``'HuggingfaceDumper'``;
    - ``config_generator['dataset_name']`` becomes
      ``'{cfg.dataset_name}_huggingface'``;
    - every ``ann_file`` listed under ``train_anns`` / ``val_anns`` /
      ``test_anns`` has its extension replaced by ``.huggingface``. When one
      of these keys is absent, a default entry is created for train and
      test, and an empty list for val.

    Args:
        cfg (Config): Config object.

    Returns:
        Config: Config object.

    Raises:
        ValueError: If a configured split preparer has no dumper.
    """
    for split in ['train', 'val', 'test']:
        preparer_cfg = cfg.get(f'{split}_preparer')
        if not preparer_cfg:
            continue
        if preparer_cfg.get('dumper') is None:
            raise ValueError(
                f'{split} split does not come with a dumper, '
                'so most likely the annotations are MMOCR-ready and do '
                'not need any adaptation, and it '
                'cannot be dumped in huggingface format.')
        preparer_cfg.dumper['type'] = 'HuggingfaceDumper'

    cfg.config_generator['dataset_name'] = f'{cfg.dataset_name}_huggingface'

    # Annotation lists used when the config generator does not declare the
    # key at all; any key not listed here (val_anns) defaults to empty.
    default_anns = {
        'train_anns': [dict(ann_file=f'{cfg.task}_train.huggingface')],
        'test_anns': [dict(ann_file=f'{cfg.task}_test.huggingface')],
    }
    for split in ['train_anns', 'val_anns', 'test_anns']:
        if split not in cfg.config_generator:
            cfg.config_generator[split] = default_anns.get(split, [])
            continue
        ann_list = cfg.config_generator[split]
        # It can be None when users want to clear out the default value
        if not ann_list:
            continue
        for ann_dict in ann_list:
            ann_dict['ann_file'] = (
                osp.splitext(ann_dict['ann_file'])[0] + '.huggingface')
        cfg.config_generator[split] = ann_list

    return cfg
176+
177+
127178
def main():
128179
args = parse_args()
129180
if args.lmdb and args.task != 'textrecog':
130181
raise ValueError('--lmdb only works with --task=textrecog')
182+
if args.huggingface and args.task not in ['ser', 're']:
183+
raise ValueError('--huggingface only works with --task=ser or re')
131184
for dataset in args.datasets:
132185
if not osp.isdir(osp.join(args.dataset_zoo_path, dataset)):
133186
warnings.warn(f'{dataset} is not supported yet. Please check '
@@ -145,6 +198,8 @@ def main():
145198
cfg.dataset_name = dataset
146199
if args.lmdb:
147200
cfg = force_lmdb(cfg)
201+
if args.huggingface:
202+
cfg = force_huggingface(cfg)
148203
preparer = DatasetPreparer.from_file(cfg)
149204
preparer.run(args.splits)
150205

0 commit comments

Comments
 (0)