Traceback (most recent call last):
  File "<string>", line 1, in <module>
  File "c:\Python311\Lib\multiprocessing\spawn.py", line 122, in spawn_main
    exitcode = _main(fd, parent_sentinel)
               ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Python311\Lib\multiprocessing\spawn.py", line 132, in _main
    self = reduction.pickle.load(from_parent)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Python311\Lib\site-packages\transformers\tokenization_utils_base.py", line 1108, in __getattr__
    raise AttributeError(f"{self.__class__.__name__} has no attribute {key}")
AttributeError: GPT2Tokenizer has no attribute modified_build_inputs_with_special_tokens. Did you mean: 'build_inputs_with_special_tokens'?
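If I read the traceback correctly, the crash happens while a spawned DataLoader worker unpickles the dataset, and with it the DTrOCRProcessor's GPT2 tokenizer, which I assume carries a patched build_inputs_with_special_tokens that does not survive pickling. A minimal check that should reproduce the same AttributeError without any DataLoader (using the IAMDataset, config and train_word_records from the code below):

import pickle

# assumption: pickling the dataset directly fails the same way the spawned worker does,
# because the dataset holds the DTrOCRProcessor and its patched tokenizer
pickle.dumps(IAMDataset(words=train_word_records, config=config))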
My train code:
import glob
from pathlib import Path
import multiprocessing as mp
import xml.etree.ElementTree as ET
from PIL import Image
from dataclasses import dataclass

@dataclass
class Word:
    id: str
    file_path: Path
    writer_id: str
    transcription: str

def get_words_from_xml(xml_file):
    tree = ET.parse(xml_file)
    root = tree.getroot()
    root_id = root.get('id')
    writer_id = root.get('writer-id')
    xml_words = []
    for line in root.findall('handwritten-part')[0].findall('line'):
        for word in line.findall('word'):
            image_file = Path([f for f in word_image_files if f.endswith(word.get('id') + '.png')][0])
            try:
                with Image.open(image_file) as _:
                    xml_words.append(
                        Word(
                            id=root_id,
                            file_path=image_file,
                            writer_id=writer_id,
                            transcription=word.get('text')
                        )
                    )
            except Exception:
                pass
    return xml_words
import tqdm
from dtrocr.processor import DTrOCRProcessor
from dtrocr.config import DTrOCRConfig
from torch.utils.data import Dataset

class IAMDataset(Dataset):
    def __init__(self, words: list[Word], config: DTrOCRConfig):
        super(IAMDataset, self).__init__()
        self.words = words
        self.processor = DTrOCRProcessor(config, add_eos_token=True, add_bos_token=True)

    def __len__(self):
        return len(self.words)

    def __getitem__(self, item):
        inputs = self.processor(
            images=Image.open(self.words[item].file_path).convert('RGB'),
            texts=self.words[item].transcription,
            padding='max_length',
            return_tensors="pt",
            return_labels=True,
        )
        return {
            'pixel_values': inputs.pixel_values[0],
            'input_ids': inputs.input_ids[0],
            'attention_mask': inputs.attention_mask[0],
            'labels': inputs.labels[0]
        }
from typing import Tuple
from torch.utils.data import DataLoader
import torch
from dtrocr.model import DTrOCRLMHeadModel

def evaluate_model(model: torch.nn.Module, dataloader: DataLoader) -> Tuple[float, float]:
    # set model to evaluation mode
    model.eval()
    losses, accuracies = [], []
    with torch.no_grad():
        for inputs in tqdm.tqdm(dataloader, total=len(dataloader), desc='Evaluating test set'):
            inputs = send_inputs_to_device(inputs, device=0)
            outputs = model(**inputs)
            losses.append(outputs.loss.item())
            accuracies.append(outputs.accuracy.item())
    loss = sum(losses) / len(losses)
    accuracy = sum(accuracies) / len(accuracies)
    # set model back to training mode
    model.train()
    return loss, accuracy

def send_inputs_to_device(dictionary, device):
    return {key: value.to(device=device) if isinstance(value, torch.Tensor) else value for key, value in dictionary.items()}
dataset_path = Path('j:/Comic translate/DTrOCR/iam_words')
xml_files = sorted(glob.glob(str(dataset_path / 'xml' / '*.xml')))[:100]
word_image_files = sorted(glob.glob(str(dataset_path / 'words' / '**' / '*.png'), recursive=True))
print(f"{len(xml_files)} XML files and {len(word_image_files)} word image files")

if __name__ == '__main__':
    with mp.Pool(processes=mp.cpu_count()) as pool:
        words_from_xmls = list(
            tqdm.tqdm(
                pool.imap(get_words_from_xml, xml_files),
                total=len(xml_files),
                desc='Building dataset'
            )
        )
    words = [word for words in words_from_xmls for word in words]

    with open('j:/Comic translate/DTrOCR/splits/train.uttlist') as fp:
        train_ids = [line.replace('\n', '') for line in fp.readlines()]
    with open('j:/Comic translate/DTrOCR/splits/test.uttlist') as fp:
        test_ids = [line.replace('\n', '') for line in fp.readlines()]
    with open('j:/Comic translate/DTrOCR/splits/validation.uttlist') as fp:
        validation_ids = [line.replace('\n', '') for line in fp.readlines()]
    print(f"Train size: {len(train_ids)}; Validation size: {len(validation_ids)}; Test size: {len(test_ids)}")

    train_word_records = [word for word in words if word.id in train_ids]
    validation_word_records = [word for word in words if word.id in validation_ids]
    test_word_records = [word for word in words if word.id in test_ids]
    print(f'Train size: {len(train_word_records)}; Validation size: {len(validation_word_records)}; Test size: {len(test_word_records)}')
print(f"{len(xml_files)} XML files and {len(word_image_files)} word image files")
    # Load config and files
    config = DTrOCRConfig(
        # attn_implementation='flash_attention_2'
    )

    train_data = IAMDataset(words=train_word_records, config=config)
    validation_data = IAMDataset(words=validation_word_records, config=config)
    test_data = IAMDataset(words=test_word_records, config=config)

    train_dataloader = DataLoader(train_data,
                                  batch_size=32,
                                  shuffle=True,
                                  num_workers=mp.cpu_count())
    validation_dataloader = DataLoader(validation_data, batch_size=32, shuffle=False, num_workers=mp.cpu_count())
    test_dataloader = DataLoader(test_data, batch_size=32, shuffle=False, num_workers=mp.cpu_count())

    torch.set_float32_matmul_precision('high')

    model = DTrOCRLMHeadModel(config)
    model = torch.compile(model)
    model.to(device=0)

    use_amp = True
    scaler = torch.amp.GradScaler("cuda", enabled=use_amp)
    optimiser = torch.optim.Adam(params=model.parameters(), lr=1e-4)

    EPOCHS = 50
    train_losses, train_accuracies = [], []
    validation_losses, validation_accuracies = [], []
    for epoch in range(EPOCHS):
        epoch_losses, epoch_accuracies = [], []
        for inputs in tqdm.tqdm(train_dataloader, total=len(train_dataloader), desc=f'Epoch {epoch + 1}'):
            # set gradients to zero
            optimiser.zero_grad()

            # send inputs to same device as model
            inputs = send_inputs_to_device(inputs, device=0)

            # forward pass
            with torch.autocast(device_type='cuda', dtype=torch.float16, enabled=use_amp):
                outputs = model(**inputs)

            # calculate gradients
            scaler.scale(outputs.loss).backward()

            # update weights
            scaler.step(optimiser)
            scaler.update()

            epoch_losses.append(outputs.loss.item())
            epoch_accuracies.append(outputs.accuracy.item())

        # store loss and metrics
        train_losses.append(sum(epoch_losses) / len(epoch_losses))
        train_accuracies.append(sum(epoch_accuracies) / len(epoch_accuracies))

        # validation loss and accuracy
        validation_loss, validation_accuracy = evaluate_model(model, validation_dataloader)
        validation_losses.append(validation_loss)
        validation_accuracies.append(validation_accuracy)

        print(f"Epoch: {epoch + 1} - Train loss: {train_losses[-1]}, Train accuracy: {train_accuracies[-1]}, Validation loss: {validation_losses[-1]}, Validation accuracy: {validation_accuracies[-1]}")
I am on Windows 10.
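A minimal sketch of two possible workarounds, assuming the cause really is that the processor cannot be pickled into the spawned workers (my guess, not documented library behaviour). The quickest check is to run the loaders single-process:

train_dataloader = DataLoader(train_data, batch_size=32, shuffle=True, num_workers=0)

If multiple workers are needed, a subclass that drops the processor before pickling and rebuilds it inside each worker might also work (PicklableIAMDataset and _config are hypothetical names I made up for illustration):

class PicklableIAMDataset(IAMDataset):
    def __init__(self, words, config):
        super().__init__(words=words, config=config)
        self._config = config  # keep the config so workers can rebuild the processor

    def __getstate__(self):
        state = self.__dict__.copy()
        state.pop('processor', None)  # do not pickle the processor / patched tokenizer
        return state

    def __setstate__(self, state):
        self.__dict__.update(state)
        # rebuild the processor in the worker process instead of unpickling it
        self.processor = DTrOCRProcessor(self._config, add_eos_token=True, add_bos_token=True)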