
AttributeError: GPT2Tokenizer has no attribute modified_build_inputs_with_special_tokens. Did you mean: 'build_inputs_with_special_tokens'? #34


Description

@heinrichI

Traceback (most recent call last):
  File "<string>", line 1, in <module>
  File "c:\Python311\Lib\multiprocessing\spawn.py", line 122, in spawn_main
    exitcode = _main(fd, parent_sentinel)
               ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Python311\Lib\multiprocessing\spawn.py", line 132, in _main
    self = reduction.pickle.load(from_parent)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Python311\Lib\site-packages\transformers\tokenization_utils_base.py", line 1108, in __getattr__
    raise AttributeError(f"{self.__class__.__name__} has no attribute {key}")
AttributeError: GPT2Tokenizer has no attribute modified_build_inputs_with_special_tokens. Did you mean: 'build_inputs_with_special_tokens'?

My train code:

import glob
from pathlib import Path

import multiprocessing as mp
import xml.etree.ElementTree as ET

from PIL import Image
from dataclasses import dataclass


@dataclass
class Word:
    id: str
    file_path: Path
    writer_id: str
    transcription: str

# Parse one IAM XML file into Word records, skipping words whose image file cannot be opened
def get_words_from_xml(xml_file):
    tree = ET.parse(xml_file)
    root = tree.getroot()
    
    root_id = root.get('id')
    writer_id = root.get('writer-id')
    xml_words = []
    for line in root.findall('handwritten-part')[0].findall('line'):
        for word in line.findall('word'):
            image_file = Path([f for f in word_image_files if f.endswith(word.get('id') + '.png')][0])
            try:
                with Image.open(image_file) as _:
                    xml_words.append(
                        Word(
                            id=root_id,
                            file_path=image_file,
                            writer_id=writer_id,
                            transcription=word.get('text')
                        )
                    )
            except Exception:
                pass
            
    return xml_words

import tqdm

from dtrocr.processor import DTrOCRProcessor
from dtrocr.config import DTrOCRConfig

from torch.utils.data import Dataset

# Dataset over IAM word records; DTrOCRProcessor handles image preprocessing and tokenisation
class IAMDataset(Dataset):
    def __init__(self, words: list[Word], config: DTrOCRConfig):
        super(IAMDataset, self).__init__()
        self.words = words
        self.processor = DTrOCRProcessor(config, add_eos_token=True, add_bos_token=True)
        
    def __len__(self):
        return len(self.words)
    
    def __getitem__(self, item):
        inputs = self.processor(
            images=Image.open(self.words[item].file_path).convert('RGB'),
            texts=self.words[item].transcription,
            padding='max_length',
            return_tensors="pt",
            return_labels=True,
        )
        return {
            'pixel_values': inputs.pixel_values[0],
            'input_ids': inputs.input_ids[0],
            'attention_mask': inputs.attention_mask[0],
            'labels': inputs.labels[0]
        }

from typing import Tuple

from torch.utils.data import DataLoader
import torch
from dtrocr.model import DTrOCRLMHeadModel

def evaluate_model(model: torch.nn.Module, dataloader: DataLoader) -> Tuple[float, float]:
    # set model to evaluation mode
    model.eval()
    
    losses, accuracies = [], []
    with torch.no_grad():
        for inputs in tqdm.tqdm(dataloader, total=len(dataloader), desc='Evaluating'):
            inputs = send_inputs_to_device(inputs, device=0)
            outputs = model(**inputs)
            
            losses.append(outputs.loss.item())
            accuracies.append(outputs.accuracy.item())
    
    loss = sum(losses) / len(losses)
    accuracy = sum(accuracies) / len(accuracies)
    
    # set model back to training mode
    model.train()
    
    return loss, accuracy

def send_inputs_to_device(dictionary, device):
    return {key: value.to(device=device) if isinstance(value, torch.Tensor) else value for key, value in dictionary.items()}


dataset_path = Path('j:/Comic translate/DTrOCR/iam_words')

xml_files = sorted(glob.glob(str(dataset_path / 'xml' / '*.xml')))[:100]
word_image_files = sorted(glob.glob(str(dataset_path / 'words' / '**' / '*.png'), recursive=True))

print(f"{len(xml_files)} XML files and {len(word_image_files)} word image files")


if __name__ == '__main__':

    with mp.Pool(processes=mp.cpu_count()) as pool:
        words_from_xmls = list(
            tqdm.tqdm(
                pool.imap(get_words_from_xml, xml_files), 
                total=len(xml_files),
                desc='Building dataset'
            )
        )

    words = [word for words in words_from_xmls for word in words]

    with open('j:/Comic translate/DTrOCR/splits/train.uttlist') as fp:
        train_ids = [line.replace('\n', '') for line in fp.readlines()]

    with open('j:/Comic translate/DTrOCR/splits/test.uttlist') as fp:
        test_ids = [line.replace('\n', '') for line in fp.readlines()]

    with open('j:/Comic translate/DTrOCR/splits/validation.uttlist') as fp:
        validation_ids = [line.replace('\n', '') for line in fp.readlines()]

    print(f"Train size: {len(train_ids)}; Validation size: {len(validation_ids)}; Test size: {len(test_ids)}")

    train_word_records = [word for word in words if word.id in train_ids]
    validation_word_records = [word for word in words if word.id in validation_ids]
    test_word_records = [word for word in words if word.id in test_ids]

    print(f'Train size: {len(train_word_records)}; Validation size: {len(validation_word_records)}; Test size: {len(test_word_records)}')


    print(f"{len(xml_files)} XML files and {len(word_image_files)} word image files")


    # Load config and files
    config = DTrOCRConfig(
        # attn_implementation='flash_attention_2'
    )
    
    train_data = IAMDataset(words=train_word_records, config=config)
    validation_data = IAMDataset(words=validation_word_records, config=config)
    test_data = IAMDataset(words=test_word_records, config=config)

    train_dataloader = DataLoader(train_data, 
        batch_size=32, 
        shuffle=True, 
        num_workers=mp.cpu_count())
    validation_dataloader = DataLoader(validation_data, batch_size=32, shuffle=False, num_workers=mp.cpu_count())
    test_dataloader = DataLoader(test_data, batch_size=32, shuffle=False, num_workers=mp.cpu_count())


    torch.set_float32_matmul_precision('high')



    model = DTrOCRLMHeadModel(config)
    model = torch.compile(model)
    model.to(device=0)

    
    use_amp = True
    scaler = torch.amp.GradScaler("cuda", enabled=use_amp)
    optimiser = torch.optim.Adam(params=model.parameters(), lr=1e-4)

    EPOCHS = 50
    train_losses, train_accuracies = [], []
    validation_losses, validation_accuracies = [], []
    for epoch in range(EPOCHS):
        epoch_losses, epoch_accuracies = [], []
        for inputs in tqdm.tqdm(train_dataloader, total=len(train_dataloader), desc=f'Epoch {epoch + 1}'):
            
            # set gradients to zero
            optimiser.zero_grad()
            
            # send inputs to same device as model
            inputs = send_inputs_to_device(inputs, device=0)
            
            # forward pass
            with torch.autocast(device_type='cuda', dtype=torch.float16, enabled=use_amp):
                outputs = model(**inputs)
            
            # calculate gradients
            scaler.scale(outputs.loss).backward()
            
            # update weights
            scaler.step(optimiser)
            scaler.update()
            
            epoch_losses.append(outputs.loss.item())
            epoch_accuracies.append(outputs.accuracy.item())
            
        # store loss and metrics
        train_losses.append(sum(epoch_losses) / len(epoch_losses))
        train_accuracies.append(sum(epoch_accuracies) / len(epoch_accuracies))
        
        # validation loss and accuracy
        validation_loss, validation_accuracy = evaluate_model(model, validation_dataloader)
        validation_losses.append(validation_loss)
        validation_accuracies.append(validation_accuracy)
                        
        print(f"Epoch: {epoch + 1} - Train loss: {train_losses[-1]}, Train accuracy: {train_accuracies[-1]}, Validation loss: {validation_losses[-1]}, Validation accuracy: {validation_accuracies[-1]}")

I am on Windows 10.
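
My guess at what is going on (I have not dug into the processor internals, so this is an assumption): on Windows, multiprocessing uses the spawn start method, so when the DataLoader starts its workers it has to pickle the IAMDataset, and with it the DTrOCRProcessor and its GPT2Tokenizer, into each child process; the traceback ends in reduction.pickle.load inside the spawned child. If DTrOCRProcessor monkey-patches the tokenizer instance with a function named modified_build_inputs_with_special_tokens, that patch cannot survive pickling: a bound method is pickled as getattr(obj, func.__name__), so the child looks up the function's own name on the tokenizer and its __getattr__ raises exactly this error. Minimal sketch of that mechanism with made-up names (Tokenizer and the free function below are illustrative, not the real DTrOCR code):

import pickle
from types import MethodType

class Tokenizer:
    # stand-in for GPT2Tokenizer: only the original method exists on the class
    def build_inputs_with_special_tokens(self, ids):
        return ids

# hypothetical replacement, analogous to what I assume the processor installs
def modified_build_inputs_with_special_tokens(self, ids):
    return [0] + ids + [1]

tokenizer = Tokenizer()
# instance-level monkey patch: stored under the original attribute name,
# but the function's __name__ is 'modified_build_inputs_with_special_tokens'
tokenizer.build_inputs_with_special_tokens = MethodType(modified_build_inputs_with_special_tokens, tokenizer)

data = pickle.dumps(tokenizer)  # dumping works: the bound method is reduced to getattr(obj, __name__)
pickle.loads(data)              # AttributeError: ... has no attribute 'modified_build_inputs_with_special_tokens'

If that diagnosis is right, setting num_workers=0 on the DataLoaders should avoid the error (nothing gets pickled to workers), but then data loading runs in the main process. Is there a recommended way to make the processor picklable on Windows?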
