diff --git a/api-reference/go/datasets/Datapoints.Ingest.mdx b/api-reference/go/datasets/Datapoints.Ingest.mdx
index da941fe..761cc6d 100644
--- a/api-reference/go/datasets/Datapoints.Ingest.mdx
+++ b/api-reference/go/datasets/Datapoints.Ingest.mdx
@@ -20,7 +20,7 @@ Ingest data points into a collection.
The id of the collection
-
+
The datapoints to ingest
@@ -50,7 +50,7 @@ datapoints := []*v1.Modis{
ingestResponse, err := client.Datapoints.Ingest(ctx,
collectionID,
- datapoints
+ &datapoints,
false,
)
```
diff --git a/api-reference/go/datasets/Datapoints.Query.mdx b/api-reference/go/datasets/Datapoints.Query.mdx
index 02f9d98..437abf5 100644
--- a/api-reference/go/datasets/Datapoints.Query.mdx
+++ b/api-reference/go/datasets/Datapoints.Query.mdx
@@ -7,20 +7,20 @@ icon: layer-group
```go
func (datapointClient) Query(
ctx context.Context,
- collectionID uuid.UUID,
+ collectionIDs []uuid.UUID,
options ...datasets.QueryOption,
) iter.Seq2[[]byte, error]
```
-Query a range of data points in this collection in a specified interval.
+Query a range of data points in the specified collections in a specified interval.
The datapoints are lazily queried and returned as a sequence of bytes.
The output sequence can be transformed into a typed `proto.Message` using [CollectAs](/api-reference/go/datasets/CollectAs) or [As](/api-reference/go/datasets/As) functions.
## Parameters
-
- The id of the collection
+
+ The ids of the collections to query
Options for querying data points
@@ -58,7 +58,7 @@ endDate := time.Date(2021, 2, 24, 0, 0, 0, 0, time.UTC)
queryInterval := query.NewTimeInterval(startDate, endDate)
datapoints, err := datasets.CollectAs[*v1.Sentinel1Sar](
- client.Datapoints.Query(ctx, collectionID, datasets.WithTemporalExtent(queryInterval)),
+ client.Datapoints.Query(ctx, []uuid.UUID{collection.ID}, datasets.WithTemporalExtent(queryInterval)),
)
```
diff --git a/api-reference/go/datasets/Datapoints.QueryInto.mdx b/api-reference/go/datasets/Datapoints.QueryInto.mdx
index ad54f01..85e565d 100644
--- a/api-reference/go/datasets/Datapoints.QueryInto.mdx
+++ b/api-reference/go/datasets/Datapoints.QueryInto.mdx
@@ -7,22 +7,22 @@ icon: layer-group
```go
func (datapointClient) QueryInto(
ctx context.Context,
- collectionID uuid.UUID,
+ collectionIDs []uuid.UUID,
datapoints any,
options ...datasets.QueryOption,
) error
```
-Query a range of data points in this collection in a specified interval.
+Query a range of data points in the specified collections in a specified interval.
QueryInto is a convenience function for [Query](/api-reference/go/datasets/Datapoints.Query), when no manual pagination or custom iteration is required.
## Parameters
-
- The id of the collection
+
+ The ids of the collections to query
-
+
The datapoints to query into
@@ -62,7 +62,7 @@ queryInterval := query.NewTimeInterval(startDate, endDate)
var datapoints []*v1.Sentinel1Sar
err := client.Datapoints.QueryInto(ctx,
- collectionID,
+ []uuid.UUID{collection.ID},
&datapoints,
datasets.WithTemporalExtent(queryInterval),
)
diff --git a/datasets/delete.mdx b/datasets/delete.mdx
index 43b8e26..52762ab 100644
--- a/datasets/delete.mdx
+++ b/datasets/delete.mdx
@@ -5,8 +5,6 @@ description: Learn how to delete data points from Tilebox datasets.
icon: trash-can
---
-import { CodeOutputHeader } from '/snippets/components.mdx';
-
You need to have write permission on the collection to be able to delete datapoints.
@@ -72,7 +70,6 @@ func main() {
}
```
-
```plaintext Output
Deleted 2 data points.
@@ -126,7 +123,6 @@ if err != nil {
slog.Info("Deleted data points", slog.Int64("deleted", numDeleted))
```
-
```plaintext Output
Deleted 104 data points.
diff --git a/datasets/ingest.mdx b/datasets/ingest.mdx
index 76126c6..9d82966 100644
--- a/datasets/ingest.mdx
+++ b/datasets/ingest.mdx
@@ -5,8 +5,6 @@ description: Learn how to ingest data into a Tilebox dataset.
icon: up-from-bracket
---
-import { CodeOutputHeader } from '/snippets/components.mdx';
-
You need to have write permission on the collection to be able to ingest data.
@@ -51,13 +49,42 @@ client = Client()
dataset = client.dataset("my_org.my_custom_dataset")
collection = dataset.get_or_create_collection("Measurements")
```
+```go Go
+package main
+
+import (
+ "context"
+ "log"
+
+ "github.com/tilebox/tilebox-go/datasets"
+)
+
+func main() {
+ ctx := context.Background()
+ client := datasets.NewClient()
+
+ dataset, err := client.Datasets.Get(ctx, "my_org.my_custom_dataset")
+ if err != nil {
+ log.Fatalf("Failed to get dataset: %v", err)
+ }
+
+ collection, err := client.Collections.GetOrCreate(ctx, dataset.ID, "Measurements")
+ if err != nil {
+ log.Fatalf("Failed to get collection: %v", err)
+ }
+}
+```
## Preparing data for ingestion
+Ingestion can be done either in Python or Go.
+
+### Python
+
[`collection.ingest`](/api-reference/python/tilebox.datasets/Collection.ingest) supports a wide range of input types. Below is an example of using either a `pandas.DataFrame` or an `xarray.Dataset` as input.
-### pandas.DataFrame
+#### pandas.DataFrame
A [pandas.DataFrame](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.html) is a representation of two-dimensional, potentially heterogeneous tabular data. It's a powerful tool for working with structured data, and Tilebox supports it as input for `ingest`.
@@ -86,9 +113,9 @@ data = pd.DataFrame({
print(data)
```
-
+
-```plaintext Python
+```plaintext Output
time value sensor precise_time sensor_history
0 2025-03-28T11:44:23Z 45.16 A 2025-03-28T11:44:23.345761444Z [-12.15, 13.45, -8.2, 16.5, 45.16]
1 2025-03-28T11:45:19Z 273.15 B 2025-03-28T11:45:19.128742312Z [300.16, 280.12, 273.15]
@@ -107,9 +134,9 @@ collection.ingest(data)
print(collection.info())
```
-
+
-```plaintext Python
+```plaintext Output
Measurements: [2025-03-28T11:44:23.000 UTC, 2025-03-28T11:45:19.000 UTC] (2 data points)
```
@@ -118,7 +145,7 @@ Measurements: [2025-03-28T11:44:23.000 UTC, 2025-03-28T11:45:19.000 UTC] (2 data
You can now also head on over to the [Tilebox Console](/console) and view the newly ingested data points there.
-### xarray.Dataset
+#### xarray.Dataset
[`xarray.Dataset`](/sdks/python/xarray) is the default format in which Tilebox Datasets returns data when
[querying data](/datasets/query) from a collection.
@@ -149,9 +176,9 @@ data = xr.Dataset({
print(data)
```
-
+
-```plaintext Python
+```plaintext Output
Size: 504B
Dimensions: (time: 2, n_sensor_history: 5)
Coordinates:
@@ -184,17 +211,47 @@ collection.ingest(data)
print(collection.info())
```
-
+
-```plaintext Python
+```plaintext Output
OtherMeasurements: [2025-03-28T11:46:13.000 UTC, 2025-03-28T11:46:54.000 UTC] (2 data points)
```
+### Go
+
+[`Client.Datapoints.Ingest`](/api-reference/go/datasets/Datapoints.Ingest) supports ingestion of data points in the form of a slice of protobuf messages.
+
+#### Protobuf
+
+Protobuf is Google's language-neutral, platform-neutral, extensible mechanism for serializing structured data.
+
+More details on protobuf can be found in the [protobuf section](/sdks/go/protobuf).
+
+In the example below, the `v1.Modis` type has been generated using [tilebox-generate](https://github.com/tilebox/tilebox-generate) as described in the [protobuf section](/sdks/go/protobuf).
+
+```go Go
+datapoints := []*v1.Modis{
+ v1.Modis_builder{
+ Time: timestamppb.New(time.Now()),
+ GranuleName: proto.String("Granule 1"),
+ }.Build(),
+ v1.Modis_builder{
+ Time: timestamppb.New(time.Now().Add(-5 * time.Hour)),
+ GranuleName: proto.String("Past Granule 2"),
+ }.Build(),
+}
+
+ingestResponse, err := client.Datapoints.Ingest(ctx,
+ collectionID,
+ &datapoints,
+ false,
+)
+```
+
## Copying or moving data
-Since [collection.load](/datasets/query) returns a `xarray.Dataset`, and `ingest` takes such a dataset as input you
-can easily copy or move data from one collection to another.
+Since `ingest` accepts the output of `query` as input, you can easily copy or move data from one collection to another.
Copying data like this also works across datasets in case the dataset schemas are compatible.
@@ -211,10 +268,51 @@ dest_collection.ingest(data_to_copy) # copy the data to the other collection
# To verify it now contains 4 datapoints (2 we ingested already, and 2 we copied just now)
print(dest_collection.info())
```
+```go Go
+dataset, err := client.Datasets.Get(ctx, "my_org.my_custom_dataset")
+if err != nil {
+ log.Fatalf("Failed to get dataset: %v", err)
+}
+
+srcCollection, err := client.Collections.GetOrCreate(ctx, dataset.ID, "Measurements")
+if err != nil {
+ log.Fatalf("Failed to get collection: %v", err)
+}
+
+startDate := time.Date(2025, time.March, 28, 0, 0, 0, 0, time.UTC)
+endDate := time.Date(2025, time.March, 29, 0, 0, 0, 0, time.UTC)
+
+var dataToCopy []*v1.MyCustomDataset
+err = client.Datapoints.QueryInto(ctx,
+ []uuid.UUID{srcCollection.ID}, &dataToCopy,
+ datasets.WithTemporalExtent(query.NewTimeInterval(startDate, endDate)),
+)
+if err != nil {
+ log.Fatalf("Failed to query datapoints: %v", err)
+}
+
+destCollection, err := client.Collections.GetOrCreate(ctx, dataset.ID, "OtherMeasurements")
+if err != nil {
+ log.Fatalf("Failed to get collection: %v", err)
+}
+
+// copy the data to the other collection
+_, err = client.Datapoints.Ingest(ctx, destCollection.ID, &dataToCopy, false)
+if err != nil {
+ log.Fatalf("Failed to ingest datapoints: %v", err)
+}
+
+// To verify it now contains 4 datapoints (2 we ingested already, and 2 we copied just now)
+updatedDestCollection, err := client.Collections.Get(ctx, dataset.ID, "OtherMeasurements")
+if err != nil {
+ log.Fatalf("Failed to get collection: %v", err)
+}
+slog.Info("Updated collection", slog.String("collection", updatedDestCollection.String()))
+```
-
+
-```plaintext Python
+```plaintext Output
OtherMeasurements: [2025-03-28T11:44:23.000 UTC, 2025-03-28T11:46:54.000 UTC] (4 data points)
```
@@ -259,10 +357,36 @@ collection.ingest(data, allow_existing=False)
# will be skipped
collection.ingest(data, allow_existing=True) # no-op
```
+```go Go
+datapoints := []*v1.MyCustomDataset{
+ v1.MyCustomDataset_builder{
+ Time: timestamppb.New(time.Date(2025, time.March, 28, 11, 45, 19, 0, time.UTC)),
+ Value: proto.Float64(45.16),
+ Sensor: proto.String("A"),
+ PreciseTime: timestamppb.New(time.Date(2025, time.March, 28, 11, 44, 23, 345761444, time.UTC)),
+ SensorHistory: []float64{-12.15, 13.45, -8.2, 16.5, 45.16},
+ }.Build(),
+}
+
+// we already ingested the same data point previously
+_, err = client.Datapoints.Ingest(ctx, collection.ID, &datapoints, false)
+if err != nil {
+ log.Fatalf("Failed to ingest datapoints: %v", err)
+}
+
+// we can still ingest it, by setting allowExisting to true
+// but the total number of datapoints will still be the same
+// as before in that case, since it already exists and therefore
+// will be skipped
+_, err = client.Datapoints.Ingest(ctx, collection.ID, &datapoints, true) // no-op
+if err != nil {
+ log.Fatalf("Failed to ingest datapoints: %v", err)
+}
+```
-
+
-```plaintext Python
+```plaintext Output
ArgumentError: found existing datapoints with same id, refusing to ingest with "allow_existing=false"
```
@@ -272,72 +396,4 @@ ArgumentError: found existing datapoints with same id, refusing to ingest with "
Through the usage of `xarray` and `pandas` you can also easily ingest existing datasets available in file
formats, such as CSV, [Parquet](https://parquet.apache.org/), [Feather](https://arrow.apache.org/docs/python/feather.html) and more.
-### CSV
-
-Comma-separated values (CSV) is a common file format for tabular data. It's widely used in data science. Tilebox
-supports CSV ingestion using the `pandas.read_csv` function.
-
-Assuming you have a CSV file named `data.csv` with the following content. If you want to follow along, you can
-download the file [here](https://storage.googleapis.com/tbx-web-assets-2bad228/docs/data-samples/ingestion_data.csv).
-
-```csv ingestion_data.csv
-time,value,sensor,precise_time,sensor_history,some_unwanted_column
-2025-03-28T11:44:23Z,45.16,A,2025-03-28T11:44:23.345761444Z,"[-12.15, 13.45, -8.2, 16.5, 45.16]","Unsupported"
-2025-03-28T11:45:19Z,273.15,B,2025-03-28T11:45:19.128742312Z,"[300.16, 280.12, 273.15]","Unsupported"
-```
-
-This data already conforms to the schema of the `MyCustomDataset` dataset, except for `some_unwanted_column` which
-you want to drop before you ingest it. Here is how this could look like:
-
-
-```python Python
-import pandas as pd
-
-data = pd.read_csv("ingestion_data.csv")
-data = data.drop(columns=["some_unwanted_column"])
-
-collection = dataset.get_or_create_collection("CSVMeasurements")
-collection.ingest(data)
-```
-
-
-### Parquet
-
-[Apache Parquet](https://parquet.apache.org/) is an open source, column-oriented data file format designed for efficient data storage and retrieval.
-Tilebox supports Parquet ingestion using the `pandas.read_parquet` function.
-
-The parquet file used in this example [is available here](https://storage.googleapis.com/tbx-web-assets-2bad228/docs/data-samples/ingestion_data.parquet).
-
-
-```python Python
-import pandas as pd
-
-data = pd.read_parquet("ingestion_data.parquet")
-
-# our data already conforms to the schema of the MyCustomDataset
-# dataset, so lets ingest it
-collection = dataset.get_or_create_collection("ParquetMeasurements")
-collection.ingest(data)
-```
-
-
-### Feather
-
-[Feather](https://arrow.apache.org/docs/python/feather.html) is a file format originating from the Apache Arrow project,
-designed for storing tabular data in a fast and memory-efficient way. It's supported by many programming languages,
-including Python. Tilebox supports Feather ingestion using the `pandas.read_feather` function.
-
-The feather file used in this example [is available here](https://storage.googleapis.com/tbx-web-assets-2bad228/docs/data-samples/ingestion_data.feather).
-
-
-```python Python
-import pandas as pd
-
-data = pd.read_feather("ingestion_data.feather")
-
-# our data already conforms to the schema of the MyCustomDataset
-# dataset, so lets ingest it
-collection = dataset.get_or_create_collection("FeatherMeasurements")
-collection.ingest(data)
-```
-
+Check out the [Ingestion from common file formats](/guides/datasets/ingest-format) guide for examples of how to achieve this.
diff --git a/guides/datasets/ingest-format.mdx b/guides/datasets/ingest-format.mdx
new file mode 100644
index 0000000..00b8f23
--- /dev/null
+++ b/guides/datasets/ingest-format.mdx
@@ -0,0 +1,78 @@
+---
+title: Ingesting from common file formats
+description: Learn how to ingest data from common file formats into Tilebox
+icon: file-binary
+---
+
+Through the usage of `xarray` and `pandas` you can also easily ingest existing datasets available in file
+formats, such as CSV, [Parquet](https://parquet.apache.org/), [Feather](https://arrow.apache.org/docs/python/feather.html) and more.
+
+## CSV
+
+Comma-separated values (CSV) is a common file format for tabular data. It's widely used in data science. Tilebox
+supports CSV ingestion using the `pandas.read_csv` function.
+
+Assuming you have a CSV file named `data.csv` with the following content. If you want to follow along, you can
+download the file [here](https://storage.googleapis.com/tbx-web-assets-2bad228/docs/data-samples/ingestion_data.csv).
+
+```csv ingestion_data.csv
+time,value,sensor,precise_time,sensor_history,some_unwanted_column
+2025-03-28T11:44:23Z,45.16,A,2025-03-28T11:44:23.345761444Z,"[-12.15, 13.45, -8.2, 16.5, 45.16]","Unsupported"
+2025-03-28T11:45:19Z,273.15,B,2025-03-28T11:45:19.128742312Z,"[300.16, 280.12, 273.15]","Unsupported"
+```
+
+This data already conforms to the schema of the `MyCustomDataset` dataset, except for `some_unwanted_column` which
+you want to drop before you ingest it. Here is how this could look:
+
+
+```python Python
+import pandas as pd
+
+data = pd.read_csv("ingestion_data.csv")
+data = data.drop(columns=["some_unwanted_column"])
+
+collection = dataset.get_or_create_collection("CSVMeasurements")
+collection.ingest(data)
+```
+
+
+## Parquet
+
+[Apache Parquet](https://parquet.apache.org/) is an open source, column-oriented data file format designed for efficient data storage and retrieval.
+Tilebox supports Parquet ingestion using the `pandas.read_parquet` function.
+
+The parquet file used in this example [is available here](https://storage.googleapis.com/tbx-web-assets-2bad228/docs/data-samples/ingestion_data.parquet).
+
+
+```python Python
+import pandas as pd
+
+data = pd.read_parquet("ingestion_data.parquet")
+
+# our data already conforms to the schema of the MyCustomDataset
+# dataset, so lets ingest it
+collection = dataset.get_or_create_collection("ParquetMeasurements")
+collection.ingest(data)
+```
+
+
+## Feather
+
+[Feather](https://arrow.apache.org/docs/python/feather.html) is a file format originating from the Apache Arrow project,
+designed for storing tabular data in a fast and memory-efficient way. It's supported by many programming languages,
+including Python. Tilebox supports Feather ingestion using the `pandas.read_feather` function.
+
+The feather file used in this example [is available here](https://storage.googleapis.com/tbx-web-assets-2bad228/docs/data-samples/ingestion_data.feather).
+
+
+```python Python
+import pandas as pd
+
+data = pd.read_feather("ingestion_data.feather")
+
+# our data already conforms to the schema of the MyCustomDataset
+# dataset, so lets ingest it
+collection = dataset.get_or_create_collection("FeatherMeasurements")
+collection.ingest(data)
+```
+
diff --git a/guides/datasets/ingest.mdx b/guides/datasets/ingest.mdx
index 1b3e526..6d72dc5 100644
--- a/guides/datasets/ingest.mdx
+++ b/guides/datasets/ingest.mdx
@@ -1,11 +1,9 @@
---
title: Ingesting data
description: Learn how to ingest an existing dataset into Tilebox
-icon: folder-arrow-up
+icon: up-from-bracket
---
-import { CodeOutputHeader } from '/snippets/components.mdx';
-
This guide is also available as a Google Colab notebook. Click here for an interactive version.
@@ -64,9 +62,9 @@ modis_data = gpd.read_parquet("modis_MCD12Q1.geoparquet")
modis_data.head(5)
```
-
+
-```plaintext Python
+```plaintext Output
time end_time granule_name geometry horizontal_tile_number vertical_tile_number tile_id file_size checksum checksum_type day_night_flag browse_granule_id published_at
0 2001-01-01 00:00:00+00:00 2001-12-31 23:59:59+00:00 MCD12Q1.A2001001.h00v08.061.2022146024956.hdf POLYGON ((-180 10, -180 0, -170 0, -172.62252 ... 0 8 51000008 275957 941243048 CKSUM Day None 2022-06-23 10:54:43.824000+00:00
1 2001-01-01 00:00:00+00:00 2001-12-31 23:59:59+00:00 MCD12Q1.A2001001.h00v09.061.2022146024922.hdf POLYGON ((-180 0, -180 -10, -172.62252 -10, -1... 0 9 51000009 285389 3014510714 CKSUM Day None 2022-06-23 10:54:44.697000+00:00
@@ -168,9 +166,9 @@ datapoint_ids = collection.ingest(modis_data)
print(f"Successfully ingested {len(datapoint_ids)} datapoints!")
```
-
+
-```plaintext Python
+```plaintext Output
Successfully ingested 7245 datapoints!
```
@@ -189,9 +187,9 @@ data = collection.load(("2015-01-01", "2020-01-01"))
data
```
-
+
-```plaintext Python
+```plaintext Output
Size: 403kB
Dimensions: (time: 1575)
Coordinates:
diff --git a/mint.json b/mint.json
index c747ffc..1be512f 100644
--- a/mint.json
+++ b/mint.json
@@ -181,7 +181,8 @@
"group": "Datasets",
"pages": [
"guides/datasets/create",
- "guides/datasets/ingest"
+ "guides/datasets/ingest",
+ "guides/datasets/ingest-format"
]
},
{