[Feature] Support lmdb format in Dataset Preparer (#1762)

gaotongxiao · web-flow · commit d56155c82df3 · 2023-03-07T20:08:25.000+08:00
* [Dataset Preparer] Support lmdb format

* fix

* fix

* fix

* fix

* fix

* readme

* readme
diff --git a/dataset_zoo/icdar2015/textrecog.py b/dataset_zoo/icdar2015/textrecog.py
@@ -61,7 +61,9 @@
     parser=dict(type='ICDARTxtTextRecogAnnParser', encoding='utf-8-sig'),
     packer=dict(type='TextRecogPacker'),
     dumper=dict(type='JsonDumper'))
-delete = ['annotations']
+delete = [
+    'annotations', 'ic15_textrecog_train_img_gt', 'ic15_textrecog_test_img'
+]
 config_generator = dict(
     type='TextRecogConfigGenerator',
     test_anns=[
diff --git a/docs/en/user_guides/data_prepare/dataset_preparer.md b/docs/en/user_guides/data_prepare/dataset_preparer.md
@@ -15,11 +15,15 @@ Only one line of command is needed to complete the data download, decompression,
 python tools/dataset_converters/prepare_dataset.py [$DATASET_NAME] --task [$TASK] --nproc [$NPROC]
 ```
 
-| ARGS         | Type | Description                                                                                                                               |
-| ------------ | ---- | ----------------------------------------------------------------------------------------------------------------------------------------- |
-| dataset_name | str  | (required) dataset name.                                                                                                                  |
-| --task       | str  | Convert the dataset to the format of a specified task supported by MMOCR. options are: 'textdet', 'textrecog', 'textspotting', and 'kie'. |
-| --nproc      | int  | Number of processes to be used. Defaults to 4.                                                                                            |
+| ARGS               | Type | Description                                                                                                                               |
+| ------------------ | ---- | ----------------------------------------------------------------------------------------------------------------------------------------- |
+| dataset_name       | str  | (required) dataset name.                                                                                                                  |
+| --nproc            | int  | Number of processes to be used. Defaults to 4.                                                                                            |
+| --task             | str  | Convert the dataset to the format of a specified task supported by MMOCR. options are: 'textdet', 'textrecog', 'textspotting', and 'kie'. |
+| --splits           | str  | Splits of the dataset to be prepared. Multiple splits can be accepted. Defaults to `train val test`.                                      |
+| --lmdb             | str  | Store the data in LMDB format. Only valid when the task is `textrecog`.                                                                   |
+| --overwrite-cfg    | str  | Whether to overwrite the dataset config file if it already exists in `configs/{task}/_base_/datasets`.                                    |
+| --dataset-zoo-path | str  | Path to the dataset config file. If not specified, the default path is `./dataset_zoo`.                                                   |
 
 For example, the following command shows how to use the script to prepare the ICDAR2015 dataset for text detection task.
 
@@ -37,6 +41,44 @@ To check the supported datasets of Dataset Preparer, please refer to [Dataset Zo
 
 ## Advanced Usage
 
+### LMDB Format
+
+In text recognition tasks, we usually use LMDB format to store data to speed up data loading. When using the `prepare_dataset.py` script to prepare data, you can store data to the LMDB format by the `--lmdb` parameter. For example:
+
+```bash
+python tools/dataset_converters/prepare_dataset.py icdar2015 --task textrecog --lmdb
+```
+
+As soon as the dataset is prepared, Dataset Preparer will generate `icdar2015_lmdb.py` in the `configs/textrecog/_base_/datasets/` directory. You can inherit this file and point the `dataloader` to the LMDB dataset. Moreover, the LMDB dataset needs to be loaded by [`LoadImageFromNDArray`](mmocr.datasets.transforms.LoadImageFromNDArray), thus you also need to modify `pipeline`.
+
+For example, if we want to change the training set of `configs/textrecog/crnn/crnn_mini-vgg_5e_mj.py` to icdar2015 generated before, we need to perform the following modifications:
+
+1. Modify `configs/textrecog/crnn/crnn_mini-vgg_5e_mj.py`:
+
+   ```python
+   _base_ = [
+        '../_base_/datasets/icdar2015_lmdb.py',  # point to icdar2015 lmdb dataset
+         ...
+    ]
+
+    train_list = [_base_.icdar2015_lmdb_textrecog_train]
+    ...
+   ```
+
+2. Modify `train_pipeline` in `configs/textrecog/crnn/_base_crnn_mini-vgg.py`, change `LoadImageFromFile` to `LoadImageFromNDArray`:
+
+   ```python
+   train_pipeline = [
+    dict(
+        type='LoadImageFromNDArray',
+        color_type='grayscale',
+        file_client_args=file_client_args,
+        ignore_empty=True,
+        min_size=2),
+    ...
+   ]
+   ```
+
 ### Configuration of Dataset Preparer
 
 Dataset preparer uses a modular design to enhance extensibility, which allows users to extend it to other public or private datasets easily. The configuration files of the dataset preparers are stored in the `dataset_zoo/`, where all the configs of currently supported datasets can be found here. The directory structure is as follows:
@@ -95,6 +137,10 @@ Data:
 
 It is not mandatory to use the metafile in the dataset preparation process (so users can ignore this file when preparing private datasets), but in order to better understand the information of each public dataset, we recommend that users read the metafile before preparing the dataset, which will help to understand whether the datasets meet their needs.
 
+```{warning}
+The following section is outdated as of MMOCR 1.0.0rc6.
+```
+
 #### Config of Dataset Preparer
 
 Next, we will introduce the conventional fields and usage of the dataset preparer configuration files.
@@ -186,7 +232,7 @@ Therefore, we provide two built-in gatherers, `pair_gather` and `mono_gather`, t
 
 When the image and annotation file are matched, the original annotations will be parsed. Since the annotation format is usually varied from dataset to dataset, the parsers are usually dataset related. Then, the parser will pack the required data into the MMOCR format.
 
-Finally, we can specify the dumpers to decide the data format. Currently, we only support `JsonDumper` and `WildreceiptOpensetDumper`, where the former is used to save the data in the standard MMOCR Json format, and the latter is used to save the data in the Wildreceipt format. In the future, we plan to support `LMDBDumper` to save the annotation files in LMDB format.
+Finally, we can specify the dumpers to decide the data format. Currently, we support `JsonDumper`, `WildreceiptOpensetDumper`, and `TextRecogLMDBDumper`. They are used to save the data in the standard MMOCR Json format, Wildreceipt format, and the LMDB format commonly used in academia in the field of text recognition, respectively.
 
 ### Use DataPreparer to prepare customized dataset
 
diff --git a/docs/zh_cn/user_guides/data_prepare/dataset_preparer.md b/docs/zh_cn/user_guides/data_prepare/dataset_preparer.md
@@ -1,4 +1,4 @@
-# 数据准备 (Beta)
+c# 数据准备 (Beta)
 
 ```{note}
 Dataset Preparer 目前仍处在公测阶段，欢迎尝鲜试用！如遇到任何问题，请及时向我们反馈。
@@ -11,16 +11,18 @@ MMOCR 提供了统一的一站式数据集准备脚本 `prepare_dataset.py`。
 仅需一行命令即可完成数据的下载、解压、格式转换，及基础配置的生成。
 
 ```bash
-python tools/dataset_converters/prepare_dataset.py [$DATASET_NAME] [--task $TASK] [--nproc $NPROC] [--overwrite-cfg] [--dataset-zoo-path $DATASET_ZOO_PATH]
+python tools/dataset_converters/prepare_dataset.py [-h] [--nproc NPROC] [--task {textdet,textrecog,textspotting,kie}] [--splits SPLITS [SPLITS ...]] [--lmdb] [--overwrite-cfg] [--dataset-zoo-path DATASET_ZOO_PATH] datasets [datasets ...]
 ```
 
-| 参数               | 类型 | 说明                                                                                                  |
-| ------------------ | ---- | ----------------------------------------------------------------------------------------------------- |
-| dataset_name       | str  | （必须）需要准备的数据集名称。                                                                        |
-| --task             | str  | 将数据集格式转换为指定任务的 MMOCR 格式。可选项为： 'textdet', 'textrecog', 'textspotting' 和 'kie'。 |
-| --nproc            | str  | 使用的进程数，默认为 4。                                                                              |
-| --overwrite-cfg    | str  | 若数据集的基础配置已经在 `configs/{task}/_base_/datasets` 中存在，依然重写该配置                      |
-| --dataset-zoo-path | str  | 存放数据库配置文件的路径。若不指定，则默认为 `./dataset_zoo`                                          |
+| 参数               | 类型                       | 说明                                                                                                  |
+| ------------------ | -------------------------- | ----------------------------------------------------------------------------------------------------- |
+| dataset_name       | str                        | （必须）需要准备的数据集名称。                                                                        |
+| --nproc            | str                        | 使用的进程数，默认为 4。                                                                              |
+| --task             | str                        | 将数据集格式转换为指定任务的 MMOCR 格式。可选项为： 'textdet', 'textrecog', 'textspotting' 和 'kie'。 |
+| --splits           | \['train', 'val', 'test'\] | 希望准备的数据集分割，可以接受多个参数。默认为 `train val test`。                                     |
+| --lmdb             | str                        | 把数据储存为 LMDB 格式，仅当任务为 `textrecog` 时生效。                                               |
+| --overwrite-cfg    | str                        | 若数据集的基础配置已经在 `configs/{task}/_base_/datasets` 中存在，依然重写该配置                      |
+| --dataset-zoo-path | str                        | 存放数据库配置文件的路径。若不指定，则默认为 `./dataset_zoo`                                          |
 
 例如，以下命令展示了如何使用该脚本为 ICDAR2015 数据集准备文本检测任务所需的数据。
 
@@ -38,6 +40,44 @@ python tools/dataset_converters/prepare_dataset.py icdar2015 totaltext --task te
 
 ## 进阶用法
 
+### LMDB 格式
+
+在文本识别任务中，我们通常使用 LMDB 格式来存储数据，以加快数据的读取速度。在使用 `prepare_dataset.py` 脚本准备数据时，可以通过 `--lmdb` 参数来指定将数据转换为 LMDB 格式。例如：
+
+```bash
+python tools/dataset_converters/prepare_dataset.py icdar2015 --task textrecog --lmdb
+```
+
+数据集准备完成后，Dataset Preparer 会在 `configs/textrecog/_base_/datasets/` 中生成 `icdar2015_lmdb.py` 配置。你可以继承该配置，并将 `dataloader` 指向 LMDB 数据集。然而，LMDB 数据集的读取需要配合 [`LoadImageFromNDArray`](mmocr.datasets.transforms.LoadImageFromNDArray)，因此你也同样需要修改 `pipeline`。
+
+例如，我们想要将 `configs/textrecog/crnn/crnn_mini-vgg_5e_mj.py` 的训练集改为刚刚生成的 icdar2015，则需要作如下修改：
+
+1. 修改 `configs/textrecog/crnn/crnn_mini-vgg_5e_mj.py`:
+
+   ```python
+   _base_ = [
+        '../_base_/datasets/icdar2015_lmdb.py',  # 指向 icdar2015 lmdb 数据集
+         ... # 省略
+    ]
+
+    train_list = [_base_.icdar2015_lmdb_textrecog_train]
+    ...
+   ```
+
+2. 修改 `configs/textrecog/crnn/_base_crnn_mini-vgg.py` 中的 `train_pipeline`, 将 `LoadImageFromFile` 改为 `LoadImageFromNDArray`：
+
+   ```python
+   train_pipeline = [
+    dict(
+        type='LoadImageFromNDArray',
+        color_type='grayscale',
+        file_client_args=file_client_args,
+        ignore_empty=True,
+        min_size=2),
+    ...
+   ]
+   ```
+
 ### 数据集配置
 
 数据集自动化准备脚本使用了模块化的设计，极大地增强了扩展性，用户能够很方便地配置其他公开数据集或私有数据集。数据集自动化准备脚本的配置文件被统一存储在 `dataset_zoo/` 目录下，用户可以在该目录下找到所有已由 MMOCR 官方支持的数据集准备脚本配置文件。该文件夹的目录结构如下：
@@ -96,6 +136,10 @@ Data:
 
 该文件在数据集准备过程中并不是强制要求的（因此用户在使用添加自己的私有数据集时可以忽略该文件），但为了用户更好地了解各个公开数据集的信息，我们建议用户在使用数据集准备脚本前阅读对应的元文件信息，以了解该数据集的特征是否符合用户需求。
 
+```{warning}
+自 MMOCR 1.0.0rc6 起，接下来的章节可能会与实际实现有所出入。
+```
+
 #### 数据集准备脚本配置文件
 
 下面，我们将介绍数据集准备脚本配置文件 `textXXX.py` 的默认字段与使用方法。
@@ -235,7 +279,7 @@ OCR 数据集通常有两种标注保存形式，一种为多个标注文件对
 
 ###### `dumper`
 
-之后，我们可以通过指定不同的 dumper 来决定要将数据保存为何种格式。目前，我们仅支持 `JsonDumper` 与 `WildreceiptOpensetDumper`，其中，前者用于将数据保存为标准的 MMOCR Json 格式，而后者用于将数据保存为 Wildreceipt 格式。未来，我们计划支持 `LMDBDumper` 用于保存 LMDB 格式的标注文件。
+之后，我们可以通过指定不同的 dumper 来决定要将数据保存为何种格式。目前，我们支持 `JsonDumper`， `WildreceiptOpensetDumper`，及  `TextRecogLMDBDumper`。他们分别用于将数据保存为标准的 MMOCR Json 格式、Wildreceipt 格式，及文本识别领域学术界常用的 LMDB 格式。
 
 ###### `delete`
 
diff --git a/mmocr/datasets/preparers/config_generators/base.py b/mmocr/datasets/preparers/config_generators/base.py
@@ -81,6 +81,17 @@ def _prepare_anns(self, train_anns: Optional[List[Dict]],
                                  ' None!')
             for ann_dict in ann_list:
                 assert 'ann_file' in ann_dict
+                suffix = ann_dict['ann_file'].split('.')[-1]
+                if suffix == 'json':
+                    dataset_type = 'OCRDataset'
+                elif suffix == 'lmdb':
+                    assert self.task == 'textrecog', \
+                        'LMDB format only works for textrecog now.'
+                    dataset_type = 'RecogLMDBDataset'
+                else:
+                    raise NotImplementedError(
+                        'ann file only supports JSON file or LMDB file')
+                ann_dict['dataset_type'] = dataset_type
                 if ann_dict.get('dataset_postfix', ''):
                     key = f'{self.dataset_name}_{ann_dict["dataset_postfix"]}_{self.task}_{split}'  # noqa
                 else:
diff --git a/mmocr/datasets/preparers/config_generators/textrecog_config_generator.py b/mmocr/datasets/preparers/config_generators/textrecog_config_generator.py
@@ -36,19 +36,38 @@ class TextRecogConfigGenerator(BaseDatasetConfigGenerator):
 
     Example:
         It generates a dataset config like:
-        >>> ic15_rec_data_root = 'data/icdar2015/'
+        >>> icdar2015_textrecog_data_root = 'data/icdar2015/'
         >>> icdar2015_textrecog_train = dict(
         >>>     type='OCRDataset',
-        >>>     data_root=ic15_rec_data_root,
+        >>>     data_root=icdar2015_textrecog_data_root,
         >>>     ann_file='textrecog_train.json',
-        >>>     test_mode=False,
         >>>     pipeline=None)
         >>> icdar2015_textrecog_test = dict(
         >>>     type='OCRDataset',
-        >>>     data_root=ic15_rec_data_root,
+        >>>     data_root=icdar2015_textrecog_data_root,
         >>>     ann_file='textrecog_test.json',
         >>>     test_mode=True,
         >>>     pipeline=None)
+
+        It generates a lmdb format dataset config like:
+        >>> icdar2015_lmdb_textrecog_data_root = 'data/icdar2015'
+        >>> icdar2015_lmdb_textrecog_train = dict(
+        >>>     type='RecogLMDBDataset',
+        >>>     data_root=icdar2015_lmdb_textrecog_data_root,
+        >>>     ann_file='textrecog_train.lmdb',
+        >>>     pipeline=None)
+        >>> icdar2015_lmdb_textrecog_test = dict(
+        >>>     type='RecogLMDBDataset',
+        >>>     data_root=icdar2015_lmdb_textrecog_data_root,
+        >>>     ann_file='textrecog_test.lmdb',
+        >>>     test_mode=True,
+        >>>     pipeline=None)
+        >>> icdar2015_lmdb_1811_textrecog_test = dict(
+        >>>     type='RecogLMDBDataset',
+        >>>     data_root=icdar2015_lmdb_textrecog_data_root,
+        >>>     ann_file='textrecog_test_1811.lmdb',
+        >>>     test_mode=True,
+        >>>     pipeline=None)
     """
 
     def __init__(
@@ -100,8 +119,8 @@ def _gen_dataset_config(self) -> str:
         cfg = ''
         for key_name, ann_dict in self.anns.items():
             cfg += f'\n{key_name} = dict(\n'
-            cfg += '    type=\'OCRDataset\',\n'
-            cfg += '    data_root=' + f'{self.dataset_name}_{self.task}_data_root,\n'  # noqa: E501
+            cfg += f'    type=\'{ann_dict["dataset_type"]}\',\n'
+            cfg += f'    data_root={self.dataset_name}_{self.task}_data_root,\n'  # noqa: E501
             cfg += f'    ann_file=\'{ann_dict["ann_file"]}\',\n'
             if ann_dict['split'] in ['test', 'val']:
                 cfg += '    test_mode=True,\n'
diff --git a/mmocr/datasets/preparers/dumpers/__init__.py b/mmocr/datasets/preparers/dumpers/__init__.py
@@ -1,6 +1,10 @@
 # Copyright (c) OpenMMLab. All rights reserved.
 from .base import BaseDumper
 from .json_dumper import JsonDumper
+from .lmdb_dumper import TextRecogLMDBDumper
 from .wild_receipt_openset_dumper import WildreceiptOpensetDumper
 
-__all__ = ['BaseDumper', 'JsonDumper', 'WildreceiptOpensetDumper']
+__all__ = [
+    'BaseDumper', 'JsonDumper', 'WildreceiptOpensetDumper',
+    'TextRecogLMDBDumper'
+]
diff --git a/mmocr/datasets/preparers/dumpers/lmdb_dumper.py b/mmocr/datasets/preparers/dumpers/lmdb_dumper.py
diff --git a/mmocr/datasets/preparers/obtainers/naive_data_obtainer.py b/mmocr/datasets/preparers/obtainers/naive_data_obtainer.py
diff --git a/tools/dataset_converters/prepare_dataset.py b/tools/dataset_converters/prepare_dataset.py