-
Notifications
You must be signed in to change notification settings - Fork 94
Open
Description
Here is an extended croissant draft for CCF crawls. Please give feedback.
@benjelloun @wumpus FYI.
What we don't have syntax for at the moment, and related issues:
- Lineage: Lineage / provenance representation #738
- Size information for FileSets: We have size information for whole FileSets (the whole WARC files), but as far as I understand, FileSet currently doesn't have
contentSize info (FileObject does).
- Parquet schemas for the columnar index: As far as I understand, Parquet schemas are not supported yet. Will they be?
Draft Croissant
EDITS:
- Corrected version to include build-version (2025-10-16)
- Updated Croissant spec version to http://mlcommons.org/croissant/1.1; noticed some datasets already use it, though it is not available at this link yet. (2025-10-16)
- Added base IRI: "https://data.commoncrawl.org/crawl-data/CC-MAIN-YYYY-WW/" (2025-10-16)
{
"@context": {
"@language": "en",
"@vocab": "https://schema.org/",
"sc": "https://schema.org/",
"cr": "http://mlcommons.org/croissant/",
"rai": "http://mlcommons.org/croissant/RAI/",
"dct": "http://purl.org/dc/terms/",
"citeAs": "cr:citeAs",
"column": "cr:column",
"conformsTo": "dct:conformsTo",
"data": {
"@id": "cr:data",
"@type": "@json"
},
"dataType": {
"@id": "cr:dataType",
"@type": "@vocab"
},
"examples": {
"@id": "cr:examples",
"@type": "@json"
},
"extract": "cr:extract",
"field": "cr:field",
"fileProperty": "cr:fileProperty",
"fileObject": "cr:fileObject",
"fileSet": "cr:fileSet",
"format": "cr:format",
"includes": "cr:includes",
"isLiveDataset": "cr:isLiveDataset",
"jsonPath": "cr:jsonPath",
"key": "cr:key",
"md5": "cr:md5",
"parentField": "cr:parentField",
"path": "cr:path",
"recordSet": "cr:recordSet",
"references": "cr:references",
"regex": "cr:regex",
"repeated": "cr:repeated",
"replace": "cr:replace",
"separator": "cr:separator",
"source": "cr:source",
"subField": "cr:subField",
"transform": "cr:transform",
"@base": "https://data.commoncrawl.org/crawl-data/CC-MAIN-2022-05/"
},
"@type": "sc:Dataset",
"conformsTo": "http://mlcommons.org/croissant/1.1",
"name": "CC-MAIN-2022-05",
"description": "Common Crawl January 2022 Crawl Archive",
"license": "https://commoncrawl.org/terms-of-use",
"url": "https://data.commoncrawl.org/crawl-data/CC-MAIN-2022-05/index.html",
"creator": {
"@type": "Organization",
"name": "The Common Crawl Foundation",
"url": "https://commoncrawl.org/"
},
"citeAs": "https://commoncrawl.org/",
"version": "1.0.0+20251015",
"datePublished": "2022-01-29T15:24:05Z",
"temporalCoverage": "2022-01-16T09:31:37Z/2022-01-29T15:24:05Z",
"distribution": [
{
"@type": "cr:FileObject",
"@id": "warc.paths.gz",
"contentUrl": "https://data.commoncrawl.org/crawl-data/CC-MAIN-2022-05/warc.paths.gz",
"encodingFormat": "application/gzip",
"sha256": "e0131030d08e07d93362feae152f3854e23093f623edd6757bb270a92f91b55b"
},
{
"@type": "cr:FileSet",
"@id": "warc-paths",
"containedIn": {
"@id": "warc.paths.gz"
},
"encodingFormat": "application/warc",
"includes": "*.warc.gz"
},
{
"@type": "cr:FileObject",
"@id": "wat.paths.gz",
"contentUrl": "https://data.commoncrawl.org/crawl-data/CC-MAIN-2022-05/wat.paths.gz",
"encodingFormat": "application/gzip",
"sha256": "c61ecb0b91767f334621da59a84ecf2b858018e5cb0c3f36789a6ac77cb6ca54"
},
{
"@type": "cr:FileSet",
"@id": "wat-paths",
"containedIn": {
"@id": "wat.paths.gz"
},
"encodingFormat": "application/warc",
"includes": "*.warc.wat.gz"
},
{
"@type": "cr:FileObject",
"@id": "wet.paths.gz",
"contentUrl": "https://data.commoncrawl.org/crawl-data/CC-MAIN-2022-05/wet.paths.gz",
"encodingFormat": "application/gzip",
"sha256": "36650ea4871ee5ed5da0a5d23ab4d6050dc2d2792c86980d76e1167d8e354a25"
},
{
"@type": "cr:FileSet",
"@id": "wet-paths",
"containedIn": {
"@id": "wet.paths.gz"
},
"encodingFormat": "application/warc",
"includes": "*.warc.wet.gz"
},
{
"@type": "cr:FileObject",
"@id": "robotstxt.paths.gz",
"contentUrl": "https://data.commoncrawl.org/crawl-data/CC-MAIN-2022-05/robotstxt.paths.gz",
"encodingFormat": "application/gzip",
"sha256": "239c08056c67673e53d1e55d1903669c8f0f5ce3d867ba56357c7f87f5387621"
},
{
"@type": "cr:FileSet",
"@id": "robotstxt-paths",
"containedIn": {
"@id": "robotstxt.paths.gz"
},
"encodingFormat": "application/warc",
"includes": "*.warc.gz"
},
{
"@type": "cr:FileObject",
"@id": "non200responses.paths.gz",
"contentUrl": "https://data.commoncrawl.org/crawl-data/CC-MAIN-2022-05/non200responses.paths.gz",
"encodingFormat": "application/gzip",
"sha256": "69a973bdc7c9f6ed60f80286fdc881db4265208adc6f0ba05488e8d52f4d9738"
},
{
"@type": "cr:FileSet",
"@id": "non200responses-paths",
"containedIn": {
"@id": "non200responses.paths.gz"
},
"encodingFormat": "application/warc",
"includes": "*.warc.gz"
},
{
"@type": "cr:FileObject",
"@id": "cc-index.paths.gz",
"contentUrl": "https://data.commoncrawl.org/crawl-data/CC-MAIN-2022-05/cc-index.paths.gz",
"encodingFormat": "application/gzip",
"sha256": "29e5ad77d5be5c2e0c8adc25231174966d82257a78d4e3d9441a10e55c0086e0"
},
{
"@type": "cr:FileSet",
"@id": "cc-index-paths",
"containedIn": {
"@id": "cc-index.paths.gz"
},
"encodingFormat": "application/gzip",
"includes": "*.gz"
},
{
"@type": "cr:FileObject",
"@id": "cc-index-table.paths.gz",
"contentUrl": "https://data.commoncrawl.org/crawl-data/CC-MAIN-2022-05/cc-index-table.paths.gz",
"encodingFormat": "application/gzip",
"sha256": "e38cdc860dfb831d9f69ef7d16ce5bcdb7e862d57611335ad2b689c9a4cf5436"
},
{
"@type": "cr:FileSet",
"@id": "cc-index-table-paths",
"containedIn": {
"@id": "cc-index-table.paths.gz"
},
"encodingFormat": "application/parquet",
"includes": "*.gz.parquet"
}
]
}
Metadata
Metadata
Assignees
Labels
No labels