Model performance regresses at later epochs #1557

Open
@tarunn2799

Description

Hey, I'm trying to train on my custom dataset using mmseg. The task is simple: segment the foreground object from the image. I've already trained this with my own YOLACT repo and it works well there, and I wanted to try out other methods like DeepLabV3+, OCRNet, etc. to see if I can get better results. I started runs for both DeepLabV3+ and OCRNet, and I see a common trend in both: the model quality gets worse in later epochs, and I'm not sure why this is happening.

Here's my setup:

For my custom dataset, I first registered my dataset class with mmseg like this:

from mmseg.datasets.builder import DATASETS
from mmseg.datasets.custom import CustomDataset
import os

classes = ('background', 'foreground')
palette = [[0,0,0], [255,255,255]]

@DATASETS.register_module()
class MeliDataset(CustomDataset):
    CLASSES = classes
    PALETTE = palette
    def __init__(self, split, **kwargs):
        super().__init__(img_suffix='.jpg', seg_map_suffix='.png',
                         split=split, reduce_zero_label=False, **kwargs)
        assert os.path.exists(self.img_dir) and self.split is not None
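One detail worth spelling out, since the palette maps foreground to [255, 255, 255]: with reduce_zero_label=False and num_classes=2, the label PNGs themselves must store class indices (0 and 1), not colors; a pixel value of 255 would be silently treated as the ignore index by the loss, not as foreground. A quick sanity check along these lines (the path is data_root + ann_dir from the config below):

import numpy as np
from PIL import Image
from pathlib import Path

# Train labels directory (data_root + ann_dir from the config below).
ann_dir = Path('/media/train_hdd6/tarun/mmseg/meli_iter11/train/labels')

# Every mask should contain only the class indices 0 (background)
# and 1 (foreground); anything else, including 255, is suspicious.
bad = []
for p in sorted(ann_dir.glob('*.png')):
    values = set(np.unique(np.array(Image.open(p))).tolist())
    if not values <= {0, 1}:
        bad.append((p.name, sorted(values)))

print(f'{len(bad)} masks with unexpected values')
for name, values in bad[:10]:
    print(name, values)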

This is my training config for the DeepLabV3+ run (the OCRNet one is very similar; I've mostly kept the default configs):

norm_cfg = dict(type='SyncBN', requires_grad=True)
model = dict(
    type='EncoderDecoder',
    pretrained='open-mmlab://resnet50_v1c',
    backbone=dict(
        type='ResNetV1c',
        depth=50,
        num_stages=4,
        out_indices=(0, 1, 2, 3),
        dilations=(1, 1, 2, 4),
        strides=(1, 2, 1, 1),
        norm_cfg=dict(type='SyncBN', requires_grad=True),
        norm_eval=False,
        style='pytorch',
        contract_dilation=True),
    decode_head=dict(
        type='DepthwiseSeparableASPPHead',
        in_channels=2048,
        in_index=3,
        channels=512,
        dilations=(1, 12, 24, 36),
        c1_in_channels=256,
        c1_channels=48,
        dropout_ratio=0.1,
        num_classes=2,
        norm_cfg=dict(type='SyncBN', requires_grad=True),
        align_corners=False,
        loss_decode=dict(
            type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)),
    auxiliary_head=dict(
        type='FCNHead',
        in_channels=1024,
        in_index=2,
        channels=256,
        num_convs=1,
        concat_input=False,
        dropout_ratio=0.1,
        num_classes=2,
        norm_cfg=dict(type='SyncBN', requires_grad=True),
        align_corners=False,
        loss_decode=dict(
            type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)),
    train_cfg=dict(),
    test_cfg=dict(mode='whole'))
log_config = dict(
    interval=50,
    hooks=[
        dict(type='TextLoggerHook', by_epoch=True),
        dict(type='TensorboardLoggerHook')
    ])
dist_params = dict(backend='nccl')
log_level = 'INFO'
load_from = None
resume_from = None
workflow = [('train', 1)]
cudnn_benchmark = True
dataset_type = 'MeliDataset'
train_data_root = '/media/train_hdd6/tarun/mmseg/meli_iter11/train'
val_data_root = '/media/train_hdd6/tarun/mmseg/meli_iter11/val'
data_root = '/media/train_hdd6/tarun/mmseg/meli_iter11/train'
train_split_file = '/media/train_hdd6/tarun/mmseg/meli_iter11/train.txt'
val_split_file = '/media/train_hdd6/tarun/mmseg/meli_iter11/val.txt'
img_dir = 'images'
ann_dir = 'labels'
img_norm_cfg = dict(
    mean=[103.94, 116.78, 123.68], std=[57.38, 57.12, 58.4], to_rgb=True)
crop_size = (512, 512)
train_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(type='LoadAnnotations', reduce_zero_label=False),
    dict(type='Resize', img_scale=(512, 512), ratio_range=(0.5, 2.0)),
    dict(type='RandomCrop', crop_size=(512, 512), cat_max_ratio=0.75),
    dict(type='RandomFlip', flip_ratio=0.5),
    dict(type='PhotoMetricDistortion'),
    dict(
        type='Normalize',
        mean=[103.94, 116.78, 123.68],
        std=[57.38, 57.12, 58.4],
        to_rgb=True),
    dict(type='Pad', size=(512, 512), pad_val=0, seg_pad_val=255),
    dict(type='DefaultFormatBundle'),
    dict(type='Collect', keys=['img', 'gt_semantic_seg'])
]
test_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(
        type='MultiScaleFlipAug',
        img_scale=(512, 512),
        flip=False,
        transforms=[
            dict(type='Resize', keep_ratio=True),
            dict(type='RandomFlip'),
            dict(
                type='Normalize',
                mean=[103.94, 116.78, 123.68],
                std=[57.38, 57.12, 58.4],
                to_rgb=True),
            dict(type='ImageToTensor', keys=['img']),
            dict(type='Collect', keys=['img'])
        ])
]
data = dict(
    samples_per_gpu=4,
    workers_per_gpu=8,
    train=dict(
        type='MeliDataset',
        data_root='/media/train_hdd6/tarun/mmseg/meli_iter11/train',
        img_dir='images',
        ann_dir='labels',
        pipeline=[
            dict(type='LoadImageFromFile'),
            dict(type='LoadAnnotations', reduce_zero_label=False),
            dict(type='Resize', img_scale=(512, 512), ratio_range=(0.5, 2.0)),
            dict(type='RandomCrop', crop_size=(512, 512), cat_max_ratio=0.75),
            dict(type='RandomFlip', flip_ratio=0.5),
            dict(type='PhotoMetricDistortion'),
            dict(
                type='Normalize',
                mean=[103.94, 116.78, 123.68],
                std=[57.38, 57.12, 58.4],
                to_rgb=True),
            dict(type='Pad', size=(512, 512), pad_val=0, seg_pad_val=255),
            dict(type='DefaultFormatBundle'),
            dict(type='Collect', keys=['img', 'gt_semantic_seg'])
        ],
        split='/media/train_hdd6/tarun/mmseg/meli_iter11/train.txt'),
    val=dict(
        type='MeliDataset',
        data_root='/media/train_hdd6/tarun/mmseg/meli_iter11/val',
        img_dir='images',
        ann_dir='labels',
        pipeline=[
            dict(type='LoadImageFromFile'),
            dict(
                type='MultiScaleFlipAug',
                img_scale=(512, 512),
                flip=False,
                transforms=[
                    dict(type='Resize', keep_ratio=True),
                    dict(type='RandomFlip'),
                    dict(
                        type='Normalize',
                        mean=[103.94, 116.78, 123.68],
                        std=[57.38, 57.12, 58.4],
                        to_rgb=True),
                    dict(type='ImageToTensor', keys=['img']),
                    dict(type='Collect', keys=['img'])
                ])
        ],
        split='/media/train_hdd6/tarun/mmseg/meli_iter11/val.txt'),
    test=dict(
        type='MeliDataset',
        data_root='/media/train_hdd6/tarun/mmseg/meli_iter11/val',
        img_dir='images',
        ann_dir='labels',
        pipeline=[
            dict(type='LoadImageFromFile'),
            dict(
                type='MultiScaleFlipAug',
                img_scale=(512, 512),
                flip=False,
                transforms=[
                    dict(type='Resize', keep_ratio=True),
                    dict(type='RandomFlip'),
                    dict(
                        type='Normalize',
                        mean=[103.94, 116.78, 123.68],
                        std=[57.38, 57.12, 58.4],
                        to_rgb=True),
                    dict(type='ImageToTensor', keys=['img']),
                    dict(type='Collect', keys=['img'])
                ])
        ],
        split='/media/train_hdd6/tarun/mmseg/meli_iter11/val.txt'))
optimizer = dict(type='SGD', lr=0.005, momentum=0.9, weight_decay=0.0005)
optimizer_config = dict()
lr_config = dict(policy='poly', power=0.9, min_lr=0.0001, by_epoch=False)
runner = dict(type='EpochBasedRunner', max_epochs=400)
evaluation = dict(metric='mIoU', pre_eval=True)
checkpoint_config = dict(interval=5, max_keep_ckpts=5)
work_dir = '/media/train_hdd6/tarun/mmseg/dlp_lr/work_dir/'
gpu_ids = range(0, 2)
auto_resume = False
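A side note on the schedule before the results: lr_config uses the poly policy with by_epoch=False, so with EpochBasedRunner the decay is spread over max_epochs * iters-per-epoch total iterations. A minimal sketch of mmcv's poly rule as configured here (the iters-per-epoch value of 250 is made up for illustration):

# Sketch of mmcv's PolyLrUpdaterHook rule for the lr_config above
# (policy='poly', power=0.9, min_lr=0.0001, by_epoch=False).
# iters_per_epoch is a hypothetical value; with by_epoch=False the
# decay runs over max_epochs * iters_per_epoch total iterations.

def poly_lr(it, base_lr=0.005, min_lr=0.0001, power=0.9,
            max_epochs=400, iters_per_epoch=250):
    max_iters = max_epochs * iters_per_epoch
    coeff = (1 - it / max_iters) ** power
    return (base_lr - min_lr) * coeff + min_lr

for epoch in (10, 100, 200, 400):
    print(f'epoch {epoch:3d}: lr ~ {poly_lr(epoch * 250 - 1):.6f}')

With these numbers the LR at epoch 200 is still around 0.0027, a bit more than half the base LR, so the model is still taking fairly large steps late in training.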

I let it train for about 200 epochs and checked the intermediate outputs regularly to see what's happening. At epoch 10, my results looked something like this:
[screenshot: segmentation prediction at epoch 10]

and for the same image, at epoch 200 the prediction went back to looking like this:
[screenshot: segmentation prediction at epoch 200]
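For reference, the two checkpoints can be compared on a fixed image with mmseg's high-level inference API (the config filename and test image below are placeholders; EpochBasedRunner saves checkpoints as epoch_N.pth in work_dir):

from mmseg.apis import init_segmentor, inference_segmentor

# Placeholder names; substitute the actual config dumped into work_dir
# and any validation image.
config_file = '/media/train_hdd6/tarun/mmseg/dlp_lr/work_dir/config.py'
img = 'sample.jpg'

for ckpt in ('epoch_10.pth', 'epoch_200.pth'):
    model = init_segmentor(
        config_file,
        f'/media/train_hdd6/tarun/mmseg/dlp_lr/work_dir/{ckpt}',
        device='cuda:0')
    # inference_segmentor returns a list with one H x W map of class indices
    seg = inference_segmentor(model, img)[0]
    print(ckpt, 'foreground pixels:', int((seg == 1).sum()))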

This is just one of many examples, and I'm not sure what's causing it. The only thing I changed in the config, besides the custom dataset parts, is the crop size, to (512, 512). I've tried reducing my lr from the default 0.01 to 0.005 to see if it helps, but it did not. I've also verified that there aren't any inconsistencies on the dataset side.

The training logs also seem fine; there's nothing unusual about the loss/accuracy curves for either the OCRNet or the DeepLabV3+ run.
[screenshot: loss/accuracy curves from TensorBoard]

I'd really appreciate it if you could help me understand what's causing this issue. I hope I've added enough context here. Thanks in advance!
