Skip to content

Commit 7ac6192

Browse files
committed
Update ttl definition for Croissant 1.1. Minor spec clean-ups.
1 parent d4c8cdf commit 7ac6192

File tree

2 files changed

+56
-21
lines changed

2 files changed

+56
-21
lines changed

docs/croissant-spec-draft.md

Lines changed: 11 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1157,11 +1157,10 @@ Sometimes, not all the data from the source is needed, but only a subset. The `E
11571157

11581158
Croissant supports a few simple transformations that can be applied on the source data:
11591159

1160-
- delimiter: split a string into an array using the supplied character.
1160+
- separator: split a string into an array using the supplied character.
11611161
- readLines: read the content of the file line by line.
11621162
- unArchive: extract the content of the archive. True by default for archive file types (zip, tgz, etc.).
1163-
- regex: A regular expression to parse the data.
1164-
- jsonPath: A JSON path to evaluate on the (JSON) data source.
1163+
- regex: A regular expression to parse the data, with one capture group that corresponds to the output value.
11651164

11661165
For example, to extract information from a filename using a regular expression, we can write:
11671166

@@ -2283,6 +2282,7 @@ This approach can be extended to other domain-specific ontologies.
22832282
"cr": "http://mlcommons.org/croissant/",
22842283
"rai": "http://mlcommons.org/croissant/RAI/",
22852284
"dct": "http://purl.org/dc/terms/",
2285+
"annotation": "cr:annotation",
22862286
"arrayShape": "cr:arrayShape",
22872287
"citeAs": "cr:citeAs",
22882288
"column": "cr:column",
@@ -2296,10 +2296,13 @@ This approach can be extended to other domain-specific ontologies.
22962296
"@id": "cr:dataType",
22972297
"@type": "@vocab"
22982298
},
2299+
"separator": "cr:separator",
2300+
"equivalentProperty": "cr:equivalentProperty",
22992301
"examples": {
23002302
"@id": "cr:examples",
23012303
"@type": "@json"
23022304
},
2305+
"excludes": "cr:excludes",
23032306
"extract": "cr:extract",
23042307
"field": "cr:field",
23052308
"fileProperty": "cr:fileProperty",
@@ -2313,14 +2316,16 @@ This approach can be extended to other domain-specific ontologies.
23132316
"key": "cr:key",
23142317
"md5": "cr:md5",
23152318
"parentField": "cr:parentField",
2316-
"path": "cr:path",
23172319
"recordSet": "cr:recordSet",
23182320
"references": "cr:references",
23192321
"regex": "cr:regex",
2320-
"replace": "cr:replace",
2322+
"readLines": "cr:readLines",
2323+
"sdVersion": "cr:sdVersion",
23212324
"separator": "cr:separator",
23222325
"source": "cr:source",
23232326
"subField": "cr:subField",
2324-
"transform": "cr:transform"
2327+
"transform": "cr:transform",
2328+
"unArchive": "cr:unArchive",
2329+
"value": "cr:value"
23252330
}
23262331
```

docs/croissant.ttl

Lines changed: 45 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -8,12 +8,12 @@
88
croissant:FileObject a rdf:Class ;
99
rdfs:label "FileObject" ;
1010
rdfs:comment "An individual file that is part of a dataset." ;
11-
rdfs:subClassOf schema:CreativeWork .
11+
rdfs:subClassOf schema:DataDownload .
1212

1313
croissant:FileSet a rdf:Class ;
1414
rdfs:label "FileSet" ;
1515
rdfs:comment "A set of homogeneous files extracted from a container, optionally filtered by inclusion and/or exclusion filters." ;
16-
rdfs:subClassOf schema:Intangible .
16+
rdfs:subClassOf schema:DataDownload .
1717

1818
croissant:RecordSet a rdf:Class ;
1919
rdfs:label "RecordSet" ;
@@ -84,21 +84,33 @@ croissant:citeAs a rdf:Property ;
8484
schema:domainIncludes schema:Dataset ;
8585
schema:rangeIncludes schema:Text .
8686

87+
croissant:md5 a rdf:Property ;
88+
rdfs:label "md5" ;
89+
rdfs:comment "The MD5 hash of the file." ;
90+
schema:domainIncludes croissant:FileObject ;
91+
schema:rangeIncludes schema:Text .
92+
93+
croissant:sdVersion a rdf:Property ;
94+
rdfs:label "sdVersion" ;
95+
rdfs:comment "The version of the dataset metadata, which may be distinct from the version of the dataset content." ;
96+
schema:domainIncludes schema:Dataset ;
97+
schema:rangeIncludes schema:Number, schema:Text .
98+
8799
# FileObject & FileSet properties
88100

89101
croissant:containedIn a rdf:Property ;
90102
rdfs:label "containedIn" ;
91-
rdfs:comment "Another FileObject or FileSet that this one is contained in, e.g., in the case of a file extracted from an archive. When this property is present, the contentUrl is evaluated as a relative path within the container object." ;
103+
rdfs:comment "Another FileObject, FileSet or DataSource that this one is contained in, e.g., in the case of a file extracted from an archive. When this property is present, the contentUrl is evaluated as a relative path within the container object." ;
92104
schema:domainIncludes croissant:FileObject, croissant:FileSet ;
93-
schema:rangeIncludes croissant:FileObject, croissant:FileSet .
105+
schema:rangeIncludes croissant:FileObject, croissant:FileSet, croissant:DataSource .
94106

95-
croissant:includes a rdf:Property ; # Should this be named includePattern instead?
107+
croissant:includes a rdf:Property ;
96108
rdfs:label "includes" ;
97109
rdfs:comment "A glob pattern that specifies the files to include, e.g., \"*.jpg\", \"/foo/pic*.jpg\". The pattern is evaluated from the root of the containedIn contents." ;
98110
schema:domainIncludes croissant:FileSet ;
99111
schema:rangeIncludes schema:Text .
100112

101-
croissant:excludes a rdf:Property ; # Should this be named excludePattern instead?
113+
croissant:excludes a rdf:Property ;
102114
rdfs:label "excludes" ;
103115
rdfs:comment "A glob pattern that specifies the files to exclude. The pattern is evaluated from the root of the containedIn contents, after the includes patterns have been evaluated." ;
104116
schema:domainIncludes croissant:FileSet ;
@@ -130,6 +142,12 @@ croissant:examples a rdf:Property ;
130142
schema:domainIncludes croissant:RecordSet ;
131143
schema:rangeIncludes rdf:JSON .
132144

145+
croissant:annotation a rdf:Property ;
146+
rdfs:label "annotation" ;
147+
rdfs:comment "One or more data-level annotations that apply to the entire record or field." ;
148+
schema:domainIncludes croissant:RecordSet, croissant:Field ;
149+
schema:rangeIncludes croissant:Field .
150+
133151
croissant:source a rdf:Property ;
134152
rdfs:label "source" ;
135153
rdfs:comment "The data source of the field. This will generally reference a FileObject or FileSet's contents (e.g., a specific column of a table)." ;
@@ -142,6 +160,12 @@ croissant:dataType a rdf:Property ;
142160
schema:domainIncludes croissant:RecordSet, croissant:Field ;
143161
schema:rangeIncludes croissant:DataType .
144162

163+
croissant:value a rdf:Property ;
164+
rdfs:label "value" ;
165+
rdfs:comment "An optional constant value for the field." ;
166+
schema:domainIncludes croissant:Field ;
167+
schema:rangeIncludes rdf:JSON .
168+
145169
croissant:repeated a rdf:Property ;
146170
rdfs:label "repeated" ;
147171
rdfs:comment "If true, then the Field is a list of values of type dataType." ;
@@ -238,24 +262,30 @@ croissant:jsonPath a rdf:Property ;
238262

239263
# Transform properties
240264

241-
croissant:delimiter a rdf:Property ;
242-
rdfs:label "delimiter" ;
243-
rdfs:comment "A delimiter to use parse the data into an array." ;
265+
croissant:separator a rdf:Property ;
266+
rdfs:label "separator" ;
267+
rdfs:comment "A separator to use to parse the data into an array." ;
244268
schema:domainIncludes croissant:Transform ;
245269
schema:rangeIncludes schema:Text .
246270

271+
croissant:readLines a rdf:Property ;
272+
rdfs:label "readLines" ;
273+
rdfs:comment "Read the content of the file line by line." ;
274+
schema:domainIncludes croissant:Transform ;
275+
schema:rangeIncludes schema:Boolean .
276+
277+
croissant:unArchive a rdf:Property ;
278+
rdfs:label "unArchive" ;
279+
rdfs:comment "Extract the content of the archive." ;
280+
schema:domainIncludes croissant:Transform ;
281+
schema:rangeIncludes schema:Boolean .
282+
247283
croissant:regex a rdf:Property ;
248284
rdfs:label "regex" ;
249285
rdfs:comment "A regular expression to apply to the data." ;
250286
schema:domainIncludes croissant:Transform ;
251287
schema:rangeIncludes schema:Text .
252288

253-
croissant:jsonQuery a rdf:Property ;
254-
rdfs:label "jsonQuery" ;
255-
rdfs:comment "For JSON content, a query to evaluate on the data." ;
256-
schema:domainIncludes croissant:Transform ;
257-
schema:rangeIncludes schema:Text .
258-
259289
### ML-specific definitions
260290

261291
croissant:Split a rdf:Class ;

0 commit comments

Comments
 (0)