Description
Hey, I'm trying to train on my custom dataset using mmseg; the task is simply to segment the foreground object from the image. I've already trained on this dataset with my own YOLACT repo and it works well there, but I wanted to try other methods like DeepLabv3+, OCRNet, etc. to see if I can get better results. I started runs for both DeepLabv3+ and OCRNet, and both show the same trend: prediction quality gets worse in later epochs, and I'm not sure why.
Here's my setup:
For my custom dataset, I first registered a dataset class with mmseg like this:
import os

from mmseg.datasets.builder import DATASETS
from mmseg.datasets.custom import CustomDataset

classes = ('background', 'foreground')
palette = [[0, 0, 0], [255, 255, 255]]


@DATASETS.register_module()
class MeliDataset(CustomDataset):
    CLASSES = classes
    PALETTE = palette

    def __init__(self, split, **kwargs):
        super().__init__(
            img_suffix='.jpg',
            seg_map_suffix='.png',
            split=split,
            reduce_zero_label=False,
            **kwargs)
        assert os.path.exists(self.img_dir) and self.split is not None
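Since reduce_zero_label=False and num_classes=2, my label PNGs are single-channel masks whose pixel values are the raw class indices (0 = background, 1 = foreground, with 255 reserved for ignored pixels), not 0/255 images. For reference, a quick check along these lines confirms the masks are in that format (paths taken from my config below):

import glob

import numpy as np
from PIL import Image

# Masks should contain only the class indices {0, 1} (plus optionally 255
# for ignored pixels). A 0/255 mask would silently map the foreground to
# the ignore index and it would never contribute to the loss.
for path in glob.glob('/media/train_hdd6/tarun/mmseg/meli_iter11/train/labels/*.png'):
    mask = np.array(Image.open(path))
    assert mask.ndim == 2, f'{path}: expected a single-channel mask'
    vals = set(np.unique(mask).tolist())
    assert vals <= {0, 1, 255}, f'{path}: unexpected label values {vals}'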
and this is my training config for the DeepLabv3+ run (the OCRNet one is very similar; I've mostly kept the default configs):
norm_cfg = dict(type='SyncBN', requires_grad=True)
model = dict(
    type='EncoderDecoder',
    pretrained='open-mmlab://resnet50_v1c',
    backbone=dict(
        type='ResNetV1c',
        depth=50,
        num_stages=4,
        out_indices=(0, 1, 2, 3),
        dilations=(1, 1, 2, 4),
        strides=(1, 2, 1, 1),
        norm_cfg=dict(type='SyncBN', requires_grad=True),
        norm_eval=False,
        style='pytorch',
        contract_dilation=True),
    decode_head=dict(
        type='DepthwiseSeparableASPPHead',
        in_channels=2048,
        in_index=3,
        channels=512,
        dilations=(1, 12, 24, 36),
        c1_in_channels=256,
        c1_channels=48,
        dropout_ratio=0.1,
        num_classes=2,
        norm_cfg=dict(type='SyncBN', requires_grad=True),
        align_corners=False,
        loss_decode=dict(
            type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)),
    auxiliary_head=dict(
        type='FCNHead',
        in_channels=1024,
        in_index=2,
        channels=256,
        num_convs=1,
        concat_input=False,
        dropout_ratio=0.1,
        num_classes=2,
        norm_cfg=dict(type='SyncBN', requires_grad=True),
        align_corners=False,
        loss_decode=dict(
            type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)),
    train_cfg=dict(),
    test_cfg=dict(mode='whole'))
log_config = dict(
    interval=50,
    hooks=[
        dict(type='TextLoggerHook', by_epoch=True),
        dict(type='TensorboardLoggerHook')
    ])
dist_params = dict(backend='nccl')
log_level = 'INFO'
load_from = None
resume_from = None
workflow = [('train', 1)]
cudnn_benchmark = True
dataset_type = 'MeliDataset'
train_data_root = '/media/train_hdd6/tarun/mmseg/meli_iter11/train'
val_data_root = '/media/train_hdd6/tarun/mmseg/meli_iter11/val'
data_root = '/media/train_hdd6/tarun/mmseg/meli_iter11/train'
train_split_file = '/media/train_hdd6/tarun/mmseg/meli_iter11/train.txt'
val_split_file = '/media/train_hdd6/tarun/mmseg/meli_iter11/val.txt'
img_dir = 'images'
ann_dir = 'labels'
img_norm_cfg = dict(
    mean=[103.94, 116.78, 123.68], std=[57.38, 57.12, 58.4], to_rgb=True)
crop_size = (512, 512)
train_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(type='LoadAnnotations', reduce_zero_label=False),
    dict(type='Resize', img_scale=(512, 512), ratio_range=(0.5, 2.0)),
    dict(type='RandomCrop', crop_size=(512, 512), cat_max_ratio=0.75),
    dict(type='RandomFlip', flip_ratio=0.5),
    dict(type='PhotoMetricDistortion'),
    dict(
        type='Normalize',
        mean=[103.94, 116.78, 123.68],
        std=[57.38, 57.12, 58.4],
        to_rgb=True),
    dict(type='Pad', size=(512, 512), pad_val=0, seg_pad_val=255),
    dict(type='DefaultFormatBundle'),
    dict(type='Collect', keys=['img', 'gt_semantic_seg'])
]
test_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(
        type='MultiScaleFlipAug',
        img_scale=(512, 512),
        flip=False,
        transforms=[
            dict(type='Resize', keep_ratio=True),
            dict(type='RandomFlip'),
            dict(
                type='Normalize',
                mean=[103.94, 116.78, 123.68],
                std=[57.38, 57.12, 58.4],
                to_rgb=True),
            dict(type='ImageToTensor', keys=['img']),
            dict(type='Collect', keys=['img'])
        ])
]
data = dict(
    samples_per_gpu=4,
    workers_per_gpu=8,
    train=dict(
        type='MeliDataset',
        data_root='/media/train_hdd6/tarun/mmseg/meli_iter11/train',
        img_dir='images',
        ann_dir='labels',
        pipeline=[
            dict(type='LoadImageFromFile'),
            dict(type='LoadAnnotations', reduce_zero_label=False),
            dict(type='Resize', img_scale=(512, 512), ratio_range=(0.5, 2.0)),
            dict(type='RandomCrop', crop_size=(512, 512), cat_max_ratio=0.75),
            dict(type='RandomFlip', flip_ratio=0.5),
            dict(type='PhotoMetricDistortion'),
            dict(
                type='Normalize',
                mean=[103.94, 116.78, 123.68],
                std=[57.38, 57.12, 58.4],
                to_rgb=True),
            dict(type='Pad', size=(512, 512), pad_val=0, seg_pad_val=255),
            dict(type='DefaultFormatBundle'),
            dict(type='Collect', keys=['img', 'gt_semantic_seg'])
        ],
        split='/media/train_hdd6/tarun/mmseg/meli_iter11/train.txt'),
    val=dict(
        type='MeliDataset',
        data_root='/media/train_hdd6/tarun/mmseg/meli_iter11/val',
        img_dir='images',
        ann_dir='labels',
        pipeline=[
            dict(type='LoadImageFromFile'),
            dict(
                type='MultiScaleFlipAug',
                img_scale=(512, 512),
                flip=False,
                transforms=[
                    dict(type='Resize', keep_ratio=True),
                    dict(type='RandomFlip'),
                    dict(
                        type='Normalize',
                        mean=[103.94, 116.78, 123.68],
                        std=[57.38, 57.12, 58.4],
                        to_rgb=True),
                    dict(type='ImageToTensor', keys=['img']),
                    dict(type='Collect', keys=['img'])
                ])
        ],
        split='/media/train_hdd6/tarun/mmseg/meli_iter11/val.txt'),
    test=dict(
        type='MeliDataset',
        data_root='/media/train_hdd6/tarun/mmseg/meli_iter11/val',
        img_dir='images',
        ann_dir='labels',
        pipeline=[
            dict(type='LoadImageFromFile'),
            dict(
                type='MultiScaleFlipAug',
                img_scale=(512, 512),
                flip=False,
                transforms=[
                    dict(type='Resize', keep_ratio=True),
                    dict(type='RandomFlip'),
                    dict(
                        type='Normalize',
                        mean=[103.94, 116.78, 123.68],
                        std=[57.38, 57.12, 58.4],
                        to_rgb=True),
                    dict(type='ImageToTensor', keys=['img']),
                    dict(type='Collect', keys=['img'])
                ])
        ],
        split='/media/train_hdd6/tarun/mmseg/meli_iter11/val.txt'))
optimizer = dict(type='SGD', lr=0.005, momentum=0.9, weight_decay=0.0005)
optimizer_config = dict()
lr_config = dict(policy='poly', power=0.9, min_lr=0.0001, by_epoch=False)
runner = dict(type='EpochBasedRunner', max_epochs=400)
evaluation = dict(metric='mIoU', pre_eval=True)
checkpoint_config = dict(interval=5, max_keep_ckpts=5)
work_dir = '/media/train_hdd6/tarun/mmseg/dlp_lr/work_dir/'
gpu_ids = range(0, 2)
auto_resume = False
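(Side note: as far as I can tell, the evaluation hook above only reports mIoU, and checkpoint_config keeps a rolling window of 5 checkpoints, so if quality really does peak early the best weights aren't necessarily preserved. If mmcv's EvalHook supports save_best the way I think it does, keeping the best-mIoU checkpoint would just be:)

# Hypothetical tweak, not part of my current run: also keep the checkpoint
# with the highest validation mIoU, in case quality peaks early and then
# degrades. Assumes the interval/save_best arguments of mmcv's EvalHook.
evaluation = dict(interval=1, metric='mIoU', pre_eval=True, save_best='mIoU')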
I let it train for about 200 epochs and tested the intermediate outputs regularly to see what's happening. At epoch 10 my results looked something like this:
and for the same image at epoch 200 the output regressed to this:
This is just one of many such examples, and I'm not sure what's causing it. Besides the custom-dataset parts, the only thing I changed in the config is the crop size, to (512, 512). I've tried reducing the lr from the default 0.01 to 0.005 to see if it helps, but it didn't. I've also verified that there are no inconsistencies on the dataset side.
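For what it's worth, my understanding of the poly policy (with by_epoch=False) is that the lr decays per iteration toward min_lr, so late in training it is already tiny, which may be why lowering base_lr changed nothing. A rough sketch (max_iters is a made-up example value; the real one depends on dataset size, GPU count, and batch size):

# Sketch of the poly lr schedule as I understand mmcv's PolyLrUpdaterHook:
# lr(t) = (base_lr - min_lr) * (1 - t / max_iters) ** power + min_lr
def poly_lr(cur_iter, max_iters, base_lr=0.005, min_lr=0.0001, power=0.9):
    coeff = (1 - cur_iter / max_iters) ** power
    return (base_lr - min_lr) * coeff + min_lr

max_iters = 100000  # hypothetical; mmcv derives the real value from the runner
for frac in (0.0, 0.25, 0.5, 0.9, 1.0):
    t = int(frac * max_iters)
    print(f'{frac:.0%} of training: lr = {poly_lr(t, max_iters):.6f}')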
The training logs also look fine; there's nothing unusual in the loss/acc curves for either the OCRNet or the DeepLabv3+ run.
I'd really appreciate any help understanding what's causing this. I hope I've added enough context here. Thanks in advance!