# mindocr/configs/rec/crnn/crnn_resnet34.yaml
# (mindspore-lab/mindocr @ 81cd5ca6897ef918bf36983a949e0d0ac5a821c7)
# Global runtime settings.
system:
  # 0 for graph mode, 1 for pynative mode in MindSpore.
  mode: 0
  distribute: True
  seed: 42
  # Precision-related switches (presumably the mixed-precision level and the
  # overflow policy — confirm against the trainer).
  amp_level: 'O3'
  drop_overflow_update: False
  # Logging / validation cadence.
  log_interval: 100
  val_while_train: True

# Shared values. Each one carries an anchor so that later sections of this
# file can reference it with an alias instead of repeating the literal.
common:
  # Explicitly null. A candidate path kept from the original comment:
  # mindocr/utils/dict/en_dict.txt
  character_dict_path: &character_dict_path null
  # num_chars_in_dict + 1.
  # TODO: retrieve it from the dict or check correctness.
  num_classes: &num_classes 37
  max_text_len: &max_text_len 24
  infer_mode: &infer_mode False
  use_space_char: &use_space_char False
  batch_size: &batch_size 64

# Network definition: a backbone, a neck and a head, each selected by `name`.
model:
  type: rec
  transform: null
  backbone:
    name: rec_resnet34
    pretrained: False
  neck:
    name: RNNEncoder
    hidden_size: 256
  head:
    name: CTCHead
    # Follows common.num_classes (37) through the alias.
    out_channels: *num_classes
    weight_init: crnn_customised
    bias_init: crnn_customised

# Post-processing settings; the decoder is selected by `name`.
postprocess:
  name: RecCTCLabelDecode
  character_dict_path: *character_dict_path  # alias of common.character_dict_path (empty there)
  use_space_char: *use_space_char  # alias of common.use_space_char (False)

# Evaluation metric settings.
metric:
  name: RecMetric
  main_indicator: acc  # presumably the indicator used to pick the best checkpoint — confirm
  character_dict_path: *character_dict_path  # alias of common.character_dict_path (empty there)
  ignore_space: True
  print_flag: False

# Loss settings (CTCLoss).
loss:
  name: CTCLoss
  pred_seq_len: 25 # TODO: retrieve from the network output shape.
  max_label_len: *max_text_len  # this value should be smaller than pred_seq_len (24 < 25 here)
  batch_size: *batch_size  # alias of common.batch_size (64)

# Learning-rate schedule.
scheduler:
  scheduler: warmup_cosine_decay
  # Learning-rate values.
  lr: 0.0005
  min_lr: 0.0
  # Epoch budget: warmup_epochs + decay_epochs adds up to num_epochs
  # (1 + 29 = 30).
  num_epochs: 30
  warmup_epochs: 1
  decay_epochs: 29

# Optimizer settings.
optimizer:
  opt: adamw
  weight_decay: 0.0001
  filter_bias_and_bn: True
  # NOTE(review): momentum / nesterov are set although opt is adamw; confirm
  # whether the optimizer factory actually reads them for this optimizer.
  momentum: 0.95
  nesterov: False

# Loss-scaler settings: type `static` with a loss_scale of 512.
loss_scaler:
  type: static
  loss_scale: 512

# Training-stage settings: checkpoint directory, dataset pipeline and loader.
train:
  ckpt_save_dir: ./tmp_rec
  # Let CTCLoss cast internally.
  pred_cast_fp32: False
  dataset_sink_mode: False
  dataset:
    type: LMDBDataset
    # Optional; if set, dataset_root will be used as a prefix for data_dir.
    dataset_root: path/to/data_lmdb_release/
    data_dir: training/
    # label_file: # not required when using LMDBDataset
    sample_ratio: 1.0
    shuffle: True
    transform_pipeline:
      - DecodeImage:
          img_mode: RGB
          to_float32: False
      - RecCTCLabelEncode:
          max_text_len: *max_text_len
          character_dict_path: *character_dict_path
          use_space_char: *use_space_char
          lower: True
      - RecResizeNormImg:
          # H, W
          image_shape: [32, 100]
          infer_mode: *infer_mode
          character_dict_path: *character_dict_path
          # Aspect ratio will be preserved if true.
          padding: False
          norm_before_pad: False
      - ToCHWImage:
    # The order of the dataloader list, matching the network input and the
    # input labels for the loss function, plus optional data for
    # debug/visualize. Further columns noted by the original author but not
    # enabled: 'length', 'img_path'.
    output_columns: [image, text_seq]
    # Input indices for the network forward func in output_columns.
    net_input_column_index: [0]
    # Input indices marked as label.
    label_column_index: [1]
    # keys_for_loss: 4 # num labels for loss func

  loader:
    # TODO: tbc
    shuffle: True
    batch_size: *batch_size
    drop_remainder: True
    max_rowsize: 12
    num_workers: 8

# Evaluation-stage settings: checkpoint to load, dataset pipeline and loader.
eval:
  ckpt_load_path: ./tmp_rec/best.ckpt
  dataset_sink_mode: False
  dataset:
    type: LMDBDataset
    dataset_root: path/to/data_lmdb_release/
    data_dir: validation/
    # label_file: # not required when using LMDBDataset
    sample_ratio: 1.0
    shuffle: False
    transform_pipeline:
      - DecodeImage:
          img_mode: RGB
          to_float32: False
      - RecCTCLabelEncode:
          max_text_len: *max_text_len
          character_dict_path: *character_dict_path
          use_space_char: *use_space_char
          lower: True
      - RecResizeNormForInfer:
          target_height: 32
          target_width: 100
          keep_ratio: False
          padding: False
          norm_before_pad: False
      - ToCHWImage:
    # The order of the dataloader list, matching the network input and the
    # input labels for the loss function, plus optional data for
    # debug/visualize.
    # TODO: return the text string padded to a fixed length, and a scalar to
    # indicate the length.
    output_columns: [image, text_padded, text_length]
    # Input indices for the network forward func in output_columns.
    net_input_column_index: [0]
    # Input indices marked as label.
    label_column_index: [1, 2]

  loader:
    # TODO: tbc
    shuffle: False
    # NOTE(review): literal 64 rather than the *batch_size alias that
    # train.loader uses — confirm this is intentional.
    batch_size: 64
    drop_remainder: False
    max_rowsize: 12
    num_workers: 8

# Prediction-stage settings: checkpoint to load, dataset pipeline and loader.
predict:
  ckpt_load_path: ./tmp_rec/best.ckpt
  vis_font_path: tools/utils/simfang.ttf
  dataset_sink_mode: False
  dataset:
    type: PredictDataset
    dataset_root: path/to/dataset_root
    data_dir: predict_result/crop
    # label_files: # not set here (the original note referred to LMDBDataset,
    # but this section uses PredictDataset)
    sample_ratio: 1.0
    shuffle: False
    transform_pipeline:
      - DecodeImage:
          img_mode: BGR
          to_float32: False
      # Label encoding is disabled for prediction; kept for reference:
      # - RecCTCLabelEncode:
      #     max_text_len: *max_text_len
      #     character_dict_path: *character_dict_path
      #     use_space_char: *use_space_char
      #     lower: True
      # Different from paddle (paddle converts the image from HWC to CHW and
      # rescales it to [-1, 1] after resize).
      - RecResizeImg:
          # H, W
          image_shape: [32, 100]
          infer_mode: *infer_mode
          character_dict_path: *character_dict_path
          # Aspect ratio will be preserved if true.
          padding: False
      # Different from paddle (paddle wrongly normalizes the BGR image with
      # the RGB mean/std from ImageNet for det, and simply rescales to
      # [-1, 1] in rec).
      - NormalizeImage:
          bgr_to_rgb: True
          is_hwc: True
          mean: [127.0, 127.0, 127.0]
          std: [127.0, 127.0, 127.0]
      - ToCHWImage:
    # The order of the dataloader list, matching the network input and the
    # input labels for the loss function, plus optional data for
    # debug/visualize.
    output_columns: [img_path, image, raw_img_shape]

  loader:
    # TODO: tbc
    shuffle: False
    batch_size: 1
    drop_remainder: True
    max_rowsize: 12
    num_workers: 8