Skip to content

Extended croissant draft for CCF crawls #961

@handecelikkanat

Description

@handecelikkanat

Here is an extended croissant draft for CCF crawls. Please give feedback.

@benjelloun @wumpus FYI.

What we don't have syntax for at the moment, and related issues:

  • Lineage: Lineage / provenance representation #738
  • Size information for FileSets: We have size information for whole FileSets (all of the WARC files together), but as far as I understand, FileSet currently has no contentSize property (FileObject does).
  • Parquet schemas for the columnar index: As far as I understand, Parquet schemas are not supported yet. Will they be?

Draft Croissant

EDITS:

  • Corrected version to include build-version (2025-10-16)
  • Updated the Croissant spec version to http://mlcommons.org/croissant/1.1. I noticed that some datasets already use it, though the spec is not available at this link yet. (2025-10-16)
  • Added base IRI: "https://data.commoncrawl.org/crawl-data/CC-MAIN-YYYY-WW/" (2025-10-16)
{
  "@context": {
    "@base": "https://data.commoncrawl.org/crawl-data/CC-MAIN-2022-05/",
    "@language": "en",
    "@vocab": "https://schema.org/",
    "sc": "https://schema.org/",
    "cr": "http://mlcommons.org/croissant/",
    "rai": "http://mlcommons.org/croissant/RAI/",
    "dct": "http://purl.org/dc/terms/",
    "citeAs": "cr:citeAs",
    "column": "cr:column",
    "conformsTo": "dct:conformsTo",
    "data": {
      "@id": "cr:data",
      "@type": "@json"
    },
    "dataType": {
      "@id": "cr:dataType",
      "@type": "@vocab"
    },
    "examples": {
      "@id": "cr:examples",
      "@type": "@json"
    },
    "extract": "cr:extract",
    "field": "cr:field",
    "fileProperty": "cr:fileProperty",
    "fileObject": "cr:fileObject",
    "fileSet": "cr:fileSet",
    "format": "cr:format",
    "includes": "cr:includes",
    "isLiveDataset": "cr:isLiveDataset",
    "jsonPath": "cr:jsonPath",
    "key": "cr:key",
    "md5": "cr:md5",
    "parentField": "cr:parentField",
    "path": "cr:path",
    "recordSet": "cr:recordSet",
    "references": "cr:references",
    "regex": "cr:regex",
    "repeated": "cr:repeated",
    "replace": "cr:replace",
    "separator": "cr:separator",
    "source": "cr:source",
    "subField": "cr:subField",
    "transform": "cr:transform"
  },
  "@type": "sc:Dataset",
  "conformsTo": "http://mlcommons.org/croissant/1.1",
  "name": "CC-MAIN-2022-05",
  "description": "Common Crawl January 2022 Crawl Archive",
  "license": "https://commoncrawl.org/terms-of-use",
  "url": "https://data.commoncrawl.org/crawl-data/CC-MAIN-2022-05/index.html",
  "creator": {
    "@type": "Organization",
    "name": "The Common Crawl Foundation",
    "url": "https://commoncrawl.org/"
  },
  "citeAs": "https://commoncrawl.org/",
  "version": "1.0.0+20251015",
  "datePublished": "2022-01-29T15:24:05Z",
  "temporalCoverage": "2022-01-16T09:31:37Z/2022-01-29T15:24:05Z",
  "distribution": [
    {
      "@type": "cr:FileObject",
      "@id": "warc.paths.gz",
      "contentUrl": "https://data.commoncrawl.org/crawl-data/CC-MAIN-2022-05/warc.paths.gz",
      "encodingFormat": "application/gzip",
      "sha256": "e0131030d08e07d93362feae152f3854e23093f623edd6757bb270a92f91b55b"
    },
    {
      "@type": "cr:FileSet",
      "@id": "warc-paths",
      "containedIn": {
        "@id": "warc.paths.gz"
      },
      "encodingFormat": "application/warc",
      "includes": "*.warc.gz"
    },
    {
      "@type": "cr:FileObject",
      "@id": "wat.paths.gz",
      "contentUrl": "https://data.commoncrawl.org/crawl-data/CC-MAIN-2022-05/wat.paths.gz",
      "encodingFormat": "application/gzip",
      "sha256": "c61ecb0b91767f334621da59a84ecf2b858018e5cb0c3f36789a6ac77cb6ca54"
    },
    {
      "@type": "cr:FileSet",
      "@id": "wat-paths",
      "containedIn": {
        "@id": "wat.paths.gz"
      },
      "encodingFormat": "application/warc",
      "includes": "*.warc.wat.gz"
    },
    {
      "@type": "cr:FileObject",
      "@id": "wet.paths.gz",
      "contentUrl": "https://data.commoncrawl.org/crawl-data/CC-MAIN-2022-05/wet.paths.gz",
      "encodingFormat": "application/gzip",
      "sha256": "36650ea4871ee5ed5da0a5d23ab4d6050dc2d2792c86980d76e1167d8e354a25"
    },
    {
      "@type": "cr:FileSet",
      "@id": "wet-paths",
      "containedIn": {
        "@id": "wet.paths.gz"
      },
      "encodingFormat": "application/warc",
      "includes": "*.warc.wet.gz"
    },
    {
      "@type": "cr:FileObject",
      "@id": "robotstxt.paths.gz",
      "contentUrl": "https://data.commoncrawl.org/crawl-data/CC-MAIN-2022-05/robotstxt.paths.gz",
      "encodingFormat": "application/gzip",
      "sha256": "239c08056c67673e53d1e55d1903669c8f0f5ce3d867ba56357c7f87f5387621"
    },
    {
      "@type": "cr:FileSet",
      "@id": "robotstxt-paths",
      "containedIn": {
        "@id": "robotstxt.paths.gz"
      },
      "encodingFormat": "application/warc",
      "includes": "*.warc.gz"
    },
    {
      "@type": "cr:FileObject",
      "@id": "non200responses.paths.gz",
      "contentUrl": "https://data.commoncrawl.org/crawl-data/CC-MAIN-2022-05/non200responses.paths.gz",
      "encodingFormat": "application/gzip",
      "sha256": "69a973bdc7c9f6ed60f80286fdc881db4265208adc6f0ba05488e8d52f4d9738"
    },
    {
      "@type": "cr:FileSet",
      "@id": "non200responses-paths",
      "containedIn": {
        "@id": "non200responses.paths.gz"
      },
      "encodingFormat": "application/warc",
      "includes": "*.warc.gz"
    },
    {
      "@type": "cr:FileObject",
      "@id": "cc-index.paths.gz",
      "contentUrl": "https://data.commoncrawl.org/crawl-data/CC-MAIN-2022-05/cc-index.paths.gz",
      "encodingFormat": "application/gzip",
      "sha256": "29e5ad77d5be5c2e0c8adc25231174966d82257a78d4e3d9441a10e55c0086e0"
    },
    {
      "@type": "cr:FileSet",
      "@id": "cc-index-paths",
      "containedIn": {
        "@id": "cc-index.paths.gz"
      },
      "encodingFormat": "application/gzip",
      "includes": "*.gz"
    },
    {
      "@type": "cr:FileObject",
      "@id": "cc-index-table.paths.gz",
      "contentUrl": "https://data.commoncrawl.org/crawl-data/CC-MAIN-2022-05/cc-index-table.paths.gz",
      "encodingFormat": "application/gzip",
      "sha256": "e38cdc860dfb831d9f69ef7d16ce5bcdb7e862d57611335ad2b689c9a4cf5436"
    },
    {
      "@type": "cr:FileSet",
      "@id": "cc-index-table-paths",
      "containedIn": {
        "@id": "cc-index-table.paths.gz"
      },
      "encodingFormat": "application/parquet",
      "includes": "*.gz.parquet"
    }
  ]
}

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions