# corpus.py

from itertools import islice
from typing import Generator, Tuple, Optional, Dict

import tensorflow_datasets as tfds


class WikipediaCorpus:
    """Wikipedia articles loaded through TensorFlow Datasets.

    Loads the ``train`` split of the requested dataset together with its
    metadata, and exposes the articles as ``(doc_id, document)`` pairs.
    """

    def __init__(self, name: str = 'wikipedia/20230601.en') -> None:
        """Load the ``train`` split and the dataset info for ``name``.

        Args:
            name: Dataset name (including config) forwarded to ``tfds.load``.
        """
        self.ds, self.info = tfds.load(name=name, split="train", with_info=True, as_supervised=False)

    def num_entries(self) -> int:
        """Return the example count the dataset info reports for ``train``."""
        return self.info.splits['train'].num_examples

    def iterator(self, max_total: Optional[int] = None) -> Generator[Tuple[str, Dict[str, str]], None, None]:
        """Yield ``(doc_id, document)`` pairs from the corpus.

        Args:
            max_total: Maximum number of documents to yield. ``None`` (the
                default) or a negative value means no limit.

        Yields:
            Tuples of the document id and a dict with the UTF-8 decoded
            ``"title"`` and ``"text"`` of the article. The id is the
            element's zero-based position in the dataset, as a string.
        """
        # Negative limits never matched the running count before, so they
        # keep meaning "no limit"; islice itself would reject them.
        limit = max_total if max_total is not None and max_total >= 0 else None
        # islice stops *before* pulling the row past the limit, so no
        # example is fetched from the dataset only to be thrown away.
        for index, row in enumerate(islice(self.ds, limit)):
            yield str(index), {
                "title": row["title"].numpy().decode("utf-8"),
                "text": row["text"].numpy().decode("utf-8"),
            }