-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcorpus.py
27 lines (19 loc) · 932 Bytes
/
corpus.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
from typing import Generator, Tuple, Optional, Dict
import tensorflow_datasets as tfds
class WikipediaCorpus:
    """Wikipedia article corpus backed by a TensorFlow Datasets dataset.

    The ``train`` split is loaded once at construction time; ``iterator``
    then walks it and yields plain-Python ``(id, document)`` pairs.
    """

    def __init__(self, name: str = 'wikipedia/20230601.en') -> None:
        """Load the ``train`` split of the TFDS dataset called *name*.

        Args:
            name: Dataset identifier passed straight to ``tfds.load``.
        """
        # with_info=True makes tfds.load return the dataset together with
        # its metadata object, which num_entries() reads later.
        # as_supervised=False keeps each row addressable by feature name.
        self.ds, self.info = tfds.load(
            name=name, split="train", with_info=True, as_supervised=False
        )

    def num_entries(self) -> int:
        """Return the example count recorded in the metadata for ``train``."""
        return self.info.splits['train'].num_examples

    def iterator(
        self, max_total: Optional[int] = None
    ) -> Generator[Tuple[str, Dict[str, str]], None, None]:
        """Yield ``(doc_id, document)`` pairs from the corpus.

        Args:
            max_total: Maximum number of documents to yield. ``None`` (the
                default) iterates over the whole dataset; ``0`` yields
                nothing.

        Yields:
            A tuple of the document id and a dict with the UTF-8 decoded
            ``"title"`` and ``"text"`` of the row.
        """
        for index, row in enumerate(self.ds):
            # Equality never matches when max_total is None, so the default
            # walks the entire dataset.
            if index == max_total:
                break
            document = {
                "title": row["title"].numpy().decode("utf-8"),
                "text": row["text"].numpy().decode("utf-8"),
            }
            # The row's position in the dataset doubles as the document id.
            yield str(index), document