Skip to content

Commit c1ac775

Browse files
feat: add example 1.1 dataset: metadata.json draft for commoncrawl CC-MAIN-2025-43
1 parent df30323 commit c1ac775

File tree

1 file changed

+251
-0
lines changed

1 file changed

+251
-0
lines changed
Lines changed: 251 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,251 @@
1+
{
2+
"@context": {
3+
"@language": "en",
4+
"@vocab": "https://schema.org/",
5+
"sc": "https://schema.org/",
6+
"cr": "http://mlcommons.org/croissant/",
7+
"rai": "http://mlcommons.org/croissant/RAI/",
8+
"dct": "http://purl.org/dc/terms/",
9+
"prov": "http://www.w3.org/ns/prov#",
10+
"citeAs": "cr:citeAs",
11+
"column": "cr:column",
12+
"conformsTo": "dct:conformsTo",
13+
"data": {
14+
"@id": "cr:data",
15+
"@type": "@json"
16+
},
17+
"dataType": {
18+
"@id": "cr:dataType",
19+
"@type": "@vocab"
20+
},
21+
"equivalentProperty": "cr:equivalentProperty",
22+
"examples": {
23+
"@id": "cr:examples",
24+
"@type": "@json"
25+
},
26+
"extract": "cr:extract",
27+
"field": "cr:field",
28+
"fileProperty": "cr:fileProperty",
29+
"fileObject": "cr:fileObject",
30+
"fileSet": "cr:fileSet",
31+
"format": "cr:format",
32+
"includes": "cr:includes",
33+
"isLiveDataset": "cr:isLiveDataset",
34+
"jsonPath": "cr:jsonPath",
35+
"key": "cr:key",
36+
"md5": "cr:md5",
37+
"parentField": "cr:parentField",
38+
"path": "cr:path",
39+
"recordSet": "cr:recordSet",
40+
"references": "cr:references",
41+
"regex": "cr:regex",
42+
"readLines": "cr:readLines",
43+
"repeated": "cr:repeated",
44+
"replace": "cr:replace",
45+
"sdVersion": "cr:sdVersion",
46+
"separator": "cr:separator",
47+
"source": "cr:source",
48+
"subField": "cr:subField",
49+
"transform": "cr:transform",
50+
"unArchive": "cr:unArchive",
51+
"value": "cr:value",
52+
"@base": "https://data.commoncrawl.org/crawl-data/CC-MAIN-2025-43/"
53+
},
54+
"@type": [
55+
"sc:Dataset",
56+
"prov:Entity"
57+
],
58+
"conformsTo": "http://mlcommons.org/croissant/1.1",
59+
"name": "CC-MAIN-2025-43",
60+
"description": "Common Crawl October 2025 Crawl Archive",
61+
"license": "https://commoncrawl.org/terms-of-use",
62+
"url": "https://data.commoncrawl.org/crawl-data/CC-MAIN-2025-43/index.html",
63+
"creator": {
64+
"@type": "Organization",
65+
"name": "The Common Crawl Foundation",
66+
"url": "https://commoncrawl.org/"
67+
},
68+
"citeAs": "https://commoncrawl.org/",
69+
"version": "1.0.0",
70+
"sdVersion": "0.1.0",
71+
"datePublished": "2025-10-19T01:06:58Z",
72+
"temporalCoverage": "2025-10-05T11:42:39Z/2025-10-19T01:06:58Z",
73+
"distribution": [
74+
{
75+
"@type": "cr:FileObject",
76+
"@id": "warc.paths.gz",
77+
"contentUrl": "https://data.commoncrawl.org/crawl-data/CC-MAIN-2025-43/warc.paths.gz",
78+
"encodingFormat": "application/gzip",
79+
"sha256": "97441dcd9ffe73099b4238cc4c327b0adffc72137b712a886b7f913e7a68ebfc"
80+
},
81+
{
82+
"@type": "cr:FileSet",
83+
"@id": "warc-files",
84+
"containedIn": {
85+
"@type": "cr:DataSource",
86+
"fileObject": {
87+
"@id": "warc.paths.gz"
88+
},
89+
"transform": {
90+
"unArchive": true,
91+
"readLines": true
92+
}
93+
},
94+
"encodingFormat": "application/warc",
95+
"includes": "*.warc.gz",
96+
"size": "97.73 TiB"
97+
},
98+
{
99+
"@type": "cr:FileObject",
100+
"@id": "wat.paths.gz",
101+
"contentUrl": "https://data.commoncrawl.org/crawl-data/CC-MAIN-2025-43/wat.paths.gz",
102+
"encodingFormat": "application/gzip",
103+
"sha256": "46ffdf3190586953f681da396b900c9c19d6e0d247d33f98ca43e4e5cc344357"
104+
},
105+
{
106+
"@type": "cr:FileSet",
107+
"@id": "wat-files",
108+
"containedIn": {
109+
"@type": "cr:DataSource",
110+
"fileObject": {
111+
"@id": "wat.paths.gz"
112+
},
113+
"transform": {
114+
"unArchive": true,
115+
"readLines": true
116+
}
117+
},
118+
"encodingFormat": "application/warc",
119+
"includes": "*.warc.wat.gz",
120+
"size": "18.39 TiB"
121+
},
122+
{
123+
"@type": "cr:FileObject",
124+
"@id": "wet.paths.gz",
125+
"contentUrl": "https://data.commoncrawl.org/crawl-data/CC-MAIN-2025-43/wet.paths.gz",
126+
"encodingFormat": "application/gzip",
127+
"sha256": "d5205d69c87a180c7f93d72927cc29d6a7e77ceade16e3c4fae3d91e3b4bb7ae"
128+
},
129+
{
130+
"@type": "cr:FileSet",
131+
"@id": "wet-files",
132+
"containedIn": {
133+
"@type": "cr:DataSource",
134+
"fileObject": {
135+
"@id": "wet.paths.gz"
136+
},
137+
"transform": {
138+
"unArchive": true,
139+
"readLines": true
140+
}
141+
},
142+
"encodingFormat": "application/warc",
143+
"includes": "*.warc.wet.gz",
144+
"size": "7.38 TiB"
145+
},
146+
{
147+
"@type": "cr:FileObject",
148+
"@id": "robotstxt.paths.gz",
149+
"contentUrl": "https://data.commoncrawl.org/crawl-data/CC-MAIN-2025-43/robotstxt.paths.gz",
150+
"encodingFormat": "application/gzip",
151+
"sha256": "9d06ffda119bb7c8db1c706f9b1d3f5d5926632ab88f7edc802dc720f0239674"
152+
},
153+
{
154+
"@type": "cr:FileSet",
155+
"@id": "robotstxt-files",
156+
"containedIn": {
157+
"@type": "cr:DataSource",
158+
"fileObject": {
159+
"@id": "robotstxt.paths.gz"
160+
},
161+
"transform": {
162+
"unArchive": true,
163+
"readLines": true
164+
}
165+
},
166+
"encodingFormat": "application/warc",
167+
"includes": "*.warc.gz",
168+
"size": "0.15 TiB"
169+
},
170+
{
171+
"@type": "cr:FileObject",
172+
"@id": "non200responses.paths.gz",
173+
"contentUrl": "https://data.commoncrawl.org/crawl-data/CC-MAIN-2025-43/non200responses.paths.gz",
174+
"encodingFormat": "application/gzip",
175+
"sha256": "6835a912596a48bc1a097c1bf2ca61bfe0b35dc1c4b1db7e8935baa9cb34c5a8"
176+
},
177+
{
178+
"@type": "cr:FileSet",
179+
"@id": "non200responses-files",
180+
"containedIn": {
181+
"@type": "cr:DataSource",
182+
"fileObject": {
183+
"@id": "non200responses.paths.gz"
184+
},
185+
"transform": {
186+
"unArchive": true,
187+
"readLines": true
188+
}
189+
},
190+
"encodingFormat": "application/warc",
191+
"includes": "*.warc.gz",
192+
"size": "3.07 TiB"
193+
},
194+
{
195+
"@type": "cr:FileObject",
196+
"@id": "cc-index.paths.gz",
197+
"contentUrl": "https://data.commoncrawl.org/crawl-data/CC-MAIN-2025-43/cc-index.paths.gz",
198+
"encodingFormat": "application/gzip",
199+
"sha256": "e9ebd0ba0e8ef9e648b3c81e90ed8ff3934ac23d38d05b33323942dc643eb650"
200+
},
201+
{
202+
"@type": "cr:FileSet",
203+
"@id": "cc-index-files",
204+
"containedIn": {
205+
"@type": "cr:DataSource",
206+
"fileObject": {
207+
"@id": "cc-index.paths.gz"
208+
},
209+
"transform": {
210+
"unArchive": true,
211+
"readLines": true
212+
}
213+
},
214+
"encodingFormat": "application/gzip",
215+
"includes": "*.gz",
216+
"size": "0.20 TiB"
217+
},
218+
{
219+
"@type": "cr:FileObject",
220+
"@id": "cc-index-table.paths.gz",
221+
"contentUrl": "https://data.commoncrawl.org/crawl-data/CC-MAIN-2025-43/cc-index-table.paths.gz",
222+
"encodingFormat": "application/gzip",
223+
"sha256": "404325746c1b5968b94afb4d5ac355dbe6eb6a1e05b6a5031f6a79e413e55965"
224+
},
225+
{
226+
"@type": "cr:FileSet",
227+
"@id": "cc-index-table-files",
228+
"containedIn": {
229+
"@type": "cr:DataSource",
230+
"fileObject": {
231+
"@id": "cc-index-table.paths.gz"
232+
},
233+
"transform": {
234+
"unArchive": true,
235+
"readLines": true
236+
}
237+
},
238+
"encodingFormat": "application/parquet",
239+
"includes": "*.gz.parquet",
240+
"size": "0.23 TiB"
241+
},
242+
{
243+
"@type": "cr:FileObject",
244+
"@id": "CC-MAIN-2025-43.domains-top-1000",
245+
"contentUrl": "s3://commoncrawl-dev/test-top-1000-domains-v1/CC-MAIN-2025-43.domains-top-1000.csv.gz",
246+
"encodingFormat": "application/gzip",
247+
"sha256": "d2816340908ecfa20db7a248f668a2470ddefe2bbc7ec652965fb78709455664",
248+
"contentSize": "8813 B"
249+
}
250+
]
251+
}

0 commit comments

Comments
 (0)