Skip to content

Commit 53e8310

Browse files
authored
Include more value types to describe fields' precision more accurately (#820)
Include more precise value types. Also include one example dataset, https://huggingface.co/datasets/pollen-robotics/apple_storage, to showcase the differences between 1.0 (same Croissant as in: https://huggingface.co/api/datasets/pollen-robotics/apple_storage/croissant) and the proposed modifications that would go into 1.1 (see the corresponding output folders). The fields' precision was taken from: https://datasets-server.huggingface.co/info?dataset=pollen-robotics/apple_storage
1 parent 60644e0 commit 53e8310

File tree

6 files changed

+523
-0
lines changed

6 files changed

+523
-0
lines changed
Lines changed: 250 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,250 @@
1+
{
2+
"@context": {
3+
"@language": "en",
4+
"@vocab": "https://schema.org/",
5+
"citeAs": "cr:citeAs",
6+
"column": "cr:column",
7+
"conformsTo": "dct:conformsTo",
8+
"cr": "http://mlcommons.org/croissant/",
9+
"data": {
10+
"@id": "cr:data",
11+
"@type": "@json"
12+
},
13+
"dataBiases": "cr:dataBiases",
14+
"dataCollection": "cr:dataCollection",
15+
"dataType": {
16+
"@id": "cr:dataType",
17+
"@type": "@vocab"
18+
},
19+
"dct": "http://purl.org/dc/terms/",
20+
"extract": "cr:extract",
21+
"field": "cr:field",
22+
"fileProperty": "cr:fileProperty",
23+
"fileObject": "cr:fileObject",
24+
"fileSet": "cr:fileSet",
25+
"format": "cr:format",
26+
"includes": "cr:includes",
27+
"isLiveDataset": "cr:isLiveDataset",
28+
"jsonPath": "cr:jsonPath",
29+
"key": "cr:key",
30+
"md5": "cr:md5",
31+
"parentField": "cr:parentField",
32+
"path": "cr:path",
33+
"personalSensitiveInformation": "cr:personalSensitiveInformation",
34+
"recordSet": "cr:recordSet",
35+
"references": "cr:references",
36+
"regex": "cr:regex",
37+
"repeated": "cr:repeated",
38+
"replace": "cr:replace",
39+
"sc": "https://schema.org/",
40+
"separator": "cr:separator",
41+
"source": "cr:source",
42+
"subField": "cr:subField",
43+
"transform": "cr:transform"
44+
},
45+
"@type": "sc:Dataset",
46+
"distribution": [
47+
{
48+
"@type": "cr:FileObject",
49+
"@id": "repo",
50+
"name": "repo",
51+
"description": "The Hugging Face git repository.",
52+
"contentUrl": "https://huggingface.co/datasets/pollen-robotics/apple_storage/tree/refs%2Fconvert%2Fparquet",
53+
"encodingFormat": "git+https",
54+
"sha256": "https://github.com/mlcommons/croissant/issues/80"
55+
},
56+
{
57+
"@type": "cr:FileSet",
58+
"@id": "parquet-files-for-config-default",
59+
"name": "parquet-files-for-config-default",
60+
"description": "The underlying Parquet files as converted by Hugging Face (see: https://huggingface.co/docs/dataset-viewer/parquet).",
61+
"containedIn": {
62+
"@id": "repo"
63+
},
64+
"encodingFormat": "application/x-parquet",
65+
"includes": "default/*/*.parquet"
66+
}
67+
],
68+
"recordSet": [
69+
{
70+
"@type": "cr:RecordSet",
71+
"dataType": "cr:Split",
72+
"key": {
73+
"@id": "default_splits/split_name"
74+
},
75+
"@id": "default_splits",
76+
"name": "default_splits",
77+
"description": "Splits for the default config.",
78+
"field": [
79+
{
80+
"@type": "cr:Field",
81+
"@id": "default_splits/split_name",
82+
"name": "split_name",
83+
"description": "The name of the split.",
84+
"dataType": "sc:Text"
85+
}
86+
],
87+
"data": {
88+
"default_splits/split_name": "train"
89+
}
90+
},
91+
{
92+
"@type": "cr:RecordSet",
93+
"@id": "default",
94+
"name": "default",
95+
"description": "pollen-robotics/apple_storage - 'default' subset",
96+
"field": [
97+
{
98+
"@type": "cr:Field",
99+
"@id": "default/split",
100+
"name": "default/split",
101+
"description": "Split to which the example belongs to.",
102+
"dataType": "sc:Text",
103+
"source": {
104+
"fileSet": {
105+
"@id": "parquet-files-for-config-default"
106+
},
107+
"extract": {
108+
"fileProperty": "fullpath"
109+
},
110+
"transform": {
111+
"regex": "default/(?:partial-)?(train)/.+parquet$"
112+
}
113+
},
114+
"references": {
115+
"field": {
116+
"@id": "default_splits/split_name"
117+
}
118+
}
119+
},
120+
{
121+
"@type": "cr:Field",
122+
"@id": "default/observation.state",
123+
"name": "default/observation.state",
124+
"description": "Column 'observation.state' from the Hugging Face parquet file.",
125+
"dataType": "sc:Float",
126+
"source": {
127+
"fileSet": {
128+
"@id": "parquet-files-for-config-default"
129+
},
130+
"extract": {
131+
"column": "observation.state"
132+
}
133+
},
134+
"repeated": true
135+
},
136+
{
137+
"@type": "cr:Field",
138+
"@id": "default/action",
139+
"name": "default/action",
140+
"description": "Column 'action' from the Hugging Face parquet file.",
141+
"dataType": "sc:Float",
142+
"source": {
143+
"fileSet": {
144+
"@id": "parquet-files-for-config-default"
145+
},
146+
"extract": {
147+
"column": "action"
148+
}
149+
},
150+
"repeated": true
151+
},
152+
{
153+
"@type": "cr:Field",
154+
"@id": "default/timestamp",
155+
"name": "default/timestamp",
156+
"description": "Column 'timestamp' from the Hugging Face parquet file.",
157+
"dataType": "sc:Float",
158+
"source": {
159+
"fileSet": {
160+
"@id": "parquet-files-for-config-default"
161+
},
162+
"extract": {
163+
"column": "timestamp"
164+
}
165+
}
166+
},
167+
{
168+
"@type": "cr:Field",
169+
"@id": "default/frame_index",
170+
"name": "default/frame_index",
171+
"description": "Column 'frame_index' from the Hugging Face parquet file.",
172+
"dataType": "sc:Integer",
173+
"source": {
174+
"fileSet": {
175+
"@id": "parquet-files-for-config-default"
176+
},
177+
"extract": {
178+
"column": "frame_index"
179+
}
180+
}
181+
},
182+
{
183+
"@type": "cr:Field",
184+
"@id": "default/episode_index",
185+
"name": "default/episode_index",
186+
"description": "Column 'episode_index' from the Hugging Face parquet file.",
187+
"dataType": "sc:Integer",
188+
"source": {
189+
"fileSet": {
190+
"@id": "parquet-files-for-config-default"
191+
},
192+
"extract": {
193+
"column": "episode_index"
194+
}
195+
}
196+
},
197+
{
198+
"@type": "cr:Field",
199+
"@id": "default/index",
200+
"name": "default/index",
201+
"description": "Column 'index' from the Hugging Face parquet file.",
202+
"dataType": "sc:Integer",
203+
"source": {
204+
"fileSet": {
205+
"@id": "parquet-files-for-config-default"
206+
},
207+
"extract": {
208+
"column": "index"
209+
}
210+
}
211+
},
212+
{
213+
"@type": "cr:Field",
214+
"@id": "default/task_index",
215+
"name": "default/task_index",
216+
"description": "Column 'task_index' from the Hugging Face parquet file.",
217+
"dataType": "sc:Integer",
218+
"source": {
219+
"fileSet": {
220+
"@id": "parquet-files-for-config-default"
221+
},
222+
"extract": {
223+
"column": "task_index"
224+
}
225+
}
226+
}
227+
]
228+
}
229+
],
230+
"conformsTo": "http://mlcommons.org/croissant/1.0",
231+
"name": "apple_storage",
232+
"description": "This dataset was created using LeRobot.\n\n\t\n\t\t\n\t\n\t\n\t\tDataset Structure\n\t\n\nmeta/info.json:\n{\n \"codebase_version\": \"v2.0\",\n \"robot_type\": \"reachy2\",\n \"total_episodes\": 50,\n \"total_frames\": 14983,\n \"total_tasks\": 1,\n \"total_videos\": 50,\n \"total_chunks\": 1,\n \"chunks_size\": 1000,\n \"fps\": 30,\n \"splits\": {\n \"train\": \"0:50\"\n },\n \"data_path\": \"data/chunk-{episode_chunk:03d}/episode_{episode_index:06d}.parquet\",\n \"video_path\":… See the full description on the dataset page: https://huggingface.co/datasets/pollen-robotics/apple_storage.",
233+
"keywords": [
234+
"robotics",
235+
"apache-2.0",
236+
"10K - 100K",
237+
"parquet",
238+
"Tabular",
239+
"Time-series",
240+
"Video",
241+
"Datasets",
242+
"Dask",
243+
"Croissant",
244+
"Polars",
245+
"🇺🇸 Region: US",
246+
"LeRobot"
247+
],
248+
"license": "https://choosealicense.com/licenses/apache-2.0/",
249+
"url": "https://huggingface.co/datasets/pollen-robotics/apple_storage"
250+
}
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
{"default/split": "train", "default/observation.state": "[0.10541590303182602, 0.17775288224220276, -0.034828223288059235, -1.779552698135376, 0.19387680292129517, 0.23834092915058136, 0.5736730694770813, 2.2687575817108154, 0.3638584017753601, -0.10709299892187119, 0.17650286853313446, -1.7178312540054321, -0.23304715752601624, -0.01350467000156641, -0.23764093220233917, 0.7148351669311523, 0.30428069829940796, 0.4503992199897766, -0.3561597168445587]", "default/action": "[0.10364092141389847, 0.17767426371574402, -0.03599759191274643, -1.7781453132629395, 0.19254662096500397, 0.23872801661491394, 0.5728810429573059, 2.268928050994873, 0.337989866733551, -0.1228933185338974, 0.12340744584798813, -1.7406771183013916, -0.21982413530349731, -0.007903007790446281, -0.23672595620155334, 2.268928050994873, 0.30656060576438904, 0.45012393593788147, -0.35880979895591736]", "default/timestamp": 0.0, "default/frame_index": 0, "default/episode_index": 0, "default/index": 0, "default/task_index": 0}
2+
{"default/split": "train", "default/observation.state": "[0.10533513873815536, 0.17774134874343872, -0.03509356081485748, -1.779287338256836, 0.193406343460083, 0.23852363228797913, 0.5733709931373596, 2.2687575817108154, 0.3596370220184326, -0.10983806848526001, 0.16703356802463531, -1.7218334674835205, -0.22976867854595184, -0.011929522268474102, -0.23680514097213745, 0.89737868309021, 0.30457937717437744, 0.4505760371685028, -0.3568390905857086]", "default/action": "[0.10363447666168213, 0.17763693630695343, -0.03626960888504982, -1.7790641784667969, 0.191580668091774, 0.2376576066017151, 0.5730078816413879, 2.268928050994873, 0.3229674994945526, -0.1312551498413086, 0.09607468545436859, -1.7481365203857422, -0.20632752776145935, -0.004209509119391441, -0.2362600415945053, 2.268928050994873, 0.3067648112773895, 0.4495879113674164, -0.3593738079071045]", "default/timestamp": 0.03333333507180214, "default/frame_index": 1, "default/episode_index": 0, "default/index": 1, "default/task_index": 0}

0 commit comments

Comments
 (0)