Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
251 changes: 251 additions & 0 deletions datasets/1.1/commoncrawl-CC-MAIN-2025-43-draft/metadata.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,251 @@
{
"@context": {
"@language": "en",
"@vocab": "https://schema.org/",
"sc": "https://schema.org/",
"cr": "http://mlcommons.org/croissant/",
"rai": "http://mlcommons.org/croissant/RAI/",
"dct": "http://purl.org/dc/terms/",
"prov": "http://www.w3.org/ns/prov#",
"citeAs": "cr:citeAs",
"column": "cr:column",
"conformsTo": "dct:conformsTo",
"data": {
"@id": "cr:data",
"@type": "@json"
},
"dataType": {
"@id": "cr:dataType",
"@type": "@vocab"
},
"equivalentProperty": "cr:equivalentProperty",
"examples": {
"@id": "cr:examples",
"@type": "@json"
},
"extract": "cr:extract",
"field": "cr:field",
"fileProperty": "cr:fileProperty",
"fileObject": "cr:fileObject",
"fileSet": "cr:fileSet",
"format": "cr:format",
"includes": "cr:includes",
"isLiveDataset": "cr:isLiveDataset",
"jsonPath": "cr:jsonPath",
"key": "cr:key",
"md5": "cr:md5",
"parentField": "cr:parentField",
"path": "cr:path",
"recordSet": "cr:recordSet",
"references": "cr:references",
"regex": "cr:regex",
"readLines": "cr:readLines",
"repeated": "cr:repeated",
"replace": "cr:replace",
"sdVersion": "cr:sdVersion",
"separator": "cr:separator",
"source": "cr:source",
"subField": "cr:subField",
"transform": "cr:transform",
"unArchive": "cr:unArchive",
"value": "cr:value",
"@base": "https://data.commoncrawl.org/crawl-data/CC-MAIN-2025-43/"
},
"@type": [
"sc:Dataset",
"prov:Entity"
],
"conformsTo": "http://mlcommons.org/croissant/1.1",
"name": "CC-MAIN-2025-43",
"description": "Common Crawl October 2025 Crawl Archive",
"license": "https://commoncrawl.org/terms-of-use",
"url": "https://data.commoncrawl.org/crawl-data/CC-MAIN-2025-43/index.html",
"creator": {
"@type": "Organization",
"name": "The Common Crawl Foundation",
"url": "https://commoncrawl.org/"
},
"citeAs": "https://commoncrawl.org/",
"version": "1.0.0",
"sdVersion": "0.1.0",
"datePublished": "2025-10-19T01:06:58Z",
"temporalCoverage": "2025-10-05T11:42:39Z/2025-10-19T01:06:58Z",
"distribution": [
{
"@type": "cr:FileObject",
"@id": "warc.paths.gz",
"contentUrl": "https://data.commoncrawl.org/crawl-data/CC-MAIN-2025-43/warc.paths.gz",
"encodingFormat": "application/gzip",
"sha256": "97441dcd9ffe73099b4238cc4c327b0adffc72137b712a886b7f913e7a68ebfc"
},
{
"@type": "cr:FileSet",
"@id": "warc-files",
"containedIn": {
"@type": "cr:DataSource",
"fileObject": {
"@id": "warc.paths.gz"
},
"transform": {
"unArchive": true,
"readLines": true
}
},
"encodingFormat": "application/warc",
"includes": "*.warc.gz",
"size": "97.73 TiB"
},
{
"@type": "cr:FileObject",
"@id": "wat.paths.gz",
"contentUrl": "https://data.commoncrawl.org/crawl-data/CC-MAIN-2025-43/wat.paths.gz",
"encodingFormat": "application/gzip",
"sha256": "46ffdf3190586953f681da396b900c9c19d6e0d247d33f98ca43e4e5cc344357"
},
{
"@type": "cr:FileSet",
"@id": "wat-files",
"containedIn": {
"@type": "cr:DataSource",
"fileObject": {
"@id": "wat.paths.gz"
},
"transform": {
"unArchive": true,
"readLines": true
}
},
"encodingFormat": "application/warc",
"includes": "*.warc.wat.gz",
"size": "18.39 TiB"
},
{
"@type": "cr:FileObject",
"@id": "wet.paths.gz",
"contentUrl": "https://data.commoncrawl.org/crawl-data/CC-MAIN-2025-43/wet.paths.gz",
"encodingFormat": "application/gzip",
"sha256": "d5205d69c87a180c7f93d72927cc29d6a7e77ceade16e3c4fae3d91e3b4bb7ae"
},
{
"@type": "cr:FileSet",
"@id": "wet-files",
"containedIn": {
"@type": "cr:DataSource",
"fileObject": {
"@id": "wet.paths.gz"
},
"transform": {
"unArchive": true,
"readLines": true
}
},
"encodingFormat": "application/warc",
"includes": "*.warc.wet.gz",
"size": "7.38 TiB"
},
{
"@type": "cr:FileObject",
"@id": "robotstxt.paths.gz",
"contentUrl": "https://data.commoncrawl.org/crawl-data/CC-MAIN-2025-43/robotstxt.paths.gz",
"encodingFormat": "application/gzip",
"sha256": "9d06ffda119bb7c8db1c706f9b1d3f5d5926632ab88f7edc802dc720f0239674"
},
{
"@type": "cr:FileSet",
"@id": "robotstxt-files",
"containedIn": {
"@type": "cr:DataSource",
"fileObject": {
"@id": "robotstxt.paths.gz"
},
"transform": {
"unArchive": true,
"readLines": true
}
},
"encodingFormat": "application/warc",
"includes": "*.warc.gz",
"size": "0.15 TiB"
},
{
"@type": "cr:FileObject",
"@id": "non200responses.paths.gz",
"contentUrl": "https://data.commoncrawl.org/crawl-data/CC-MAIN-2025-43/non200responses.paths.gz",
"encodingFormat": "application/gzip",
"sha256": "6835a912596a48bc1a097c1bf2ca61bfe0b35dc1c4b1db7e8935baa9cb34c5a8"
},
{
"@type": "cr:FileSet",
"@id": "non200responses-files",
"containedIn": {
"@type": "cr:DataSource",
"fileObject": {
"@id": "non200responses.paths.gz"
},
"transform": {
"unArchive": true,
"readLines": true
}
},
"encodingFormat": "application/warc",
"includes": "*.warc.gz",
"size": "3.07 TiB"
},
{
"@type": "cr:FileObject",
"@id": "cc-index.paths.gz",
"contentUrl": "https://data.commoncrawl.org/crawl-data/CC-MAIN-2025-43/cc-index.paths.gz",
"encodingFormat": "application/gzip",
"sha256": "e9ebd0ba0e8ef9e648b3c81e90ed8ff3934ac23d38d05b33323942dc643eb650"
},
{
"@type": "cr:FileSet",
"@id": "cc-index-files",
"containedIn": {
"@type": "cr:DataSource",
"fileObject": {
"@id": "cc-index.paths.gz"
},
"transform": {
"unArchive": true,
"readLines": true
}
},
"encodingFormat": "application/gzip",
"includes": "*.gz",
"size": "0.20 TiB"
},
{
"@type": "cr:FileObject",
"@id": "cc-index-table.paths.gz",
"contentUrl": "https://data.commoncrawl.org/crawl-data/CC-MAIN-2025-43/cc-index-table.paths.gz",
"encodingFormat": "application/gzip",
"sha256": "404325746c1b5968b94afb4d5ac355dbe6eb6a1e05b6a5031f6a79e413e55965"
},
{
"@type": "cr:FileSet",
"@id": "cc-index-table-files",
"containedIn": {
"@type": "cr:DataSource",
"fileObject": {
"@id": "cc-index-table.paths.gz"
},
"transform": {
"unArchive": true,
"readLines": true
}
},
"encodingFormat": "application/x-parquet",
"includes": "*.gz.parquet",
"size": "0.23 TiB"
},
{
"@type": "cr:FileObject",
"@id": "CC-MAIN-2025-43.domains-top-1000",
"contentUrl": "s3://commoncrawl-dev/test-top-1000-domains-v1/CC-MAIN-2025-43.domains-top-1000.csv.gz",
"encodingFormat": "application/gzip",
"sha256": "d2816340908ecfa20db7a248f668a2470ddefe2bbc7ec652965fb78709455664",
"contentSize": "8813 B"
}
]
}
Loading