Skip to content

Commit 7a20ce0

Browse files
authored
feat: add parsing APIs (read, parse, get results) (#6)
1 parent ac48771 commit 7a20ce0

File tree

11 files changed

+1197
-0
lines changed

11 files changed

+1197
-0
lines changed

go.mod

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
11
module github.com/sixt/tensorlake-go
22

33
go 1.25
4+
5+
require github.com/google/jsonschema-go v0.4.2

go.sum

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
github.com/google/go-cmp v0.7.0 h1:wk8382ETsv4JYUZwIsn6YpYiWiBsYLSJiTsyBybVuN8=
2+
github.com/google/go-cmp v0.7.0/go.mod h1:pXiqmnSA92OHEEa9HXL2W4E7lf9JzCmGVUdgjX3N/iU=
3+
github.com/google/jsonschema-go v0.4.2 h1:tmrUohrwoLZZS/P3x7ex0WAVknEkBZM46iALbcqoRA8=
4+
github.com/google/jsonschema-go v0.4.2/go.mod h1:r5quNTdLOYEz95Ru18zA0ydNbBuYoo9tgaYcxEYhJVE=

opt.go

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
// Copyright 2025 SIXT SE
2+
//
3+
// Licensed under the Apache License, Version 2.0 (the "License");
4+
// you may not use this file except in compliance with the License.
5+
// You may obtain a copy of the License at
6+
//
7+
// http://www.apache.org/licenses/LICENSE-2.0
8+
//
9+
// Unless required by applicable law or agreed to in writing, software
10+
// distributed under the License is distributed on an "AS IS" BASIS,
11+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
// See the License for the specific language governing permissions and
13+
// limitations under the License.
14+
15+
package tensorlake
16+
17+
import (
18+
"encoding/json"
19+
"fmt"
20+
)
21+
22+
// UnionValues is a union of values of type T.
//
// On the wire it may be either a single JSON value or a JSON array of
// values; in Go it is always represented as a slice. It is always encoded
// back to JSON as an array.
type UnionValues[T any] []T

// UnmarshalJSON unmarshals a JSON array or a single value into a UnionValues.
//
// Decoding rules:
//   - JSON null is a no-op and leaves *v unchanged, following the
//     encoding/json convention for Unmarshaler implementations.
//   - A payload that starts with '[' is first decoded as an array of T. If
//     that fails (for example because T is itself a slice type), it is
//     decoded as a single T.
//   - Any other payload is decoded as a single T.
//
// An error is returned when none of the interpretations succeed.
func (v *UnionValues[T]) UnmarshalJSON(b []byte) error {
	// Look at the first non-whitespace byte to decide which form to prefer.
	// Trying the array form first matters for element types that can hold
	// an array themselves (e.g. any), where the single-value decode would
	// otherwise swallow the whole array as one element.
	isArray := false
	for _, c := range b {
		if c == ' ' || c == '\t' || c == '\n' || c == '\r' {
			continue
		}
		isArray = c == '['
		break
	}

	if isArray {
		var arr []T
		if err := json.Unmarshal(b, &arr); err == nil {
			*v = arr
			return nil
		}
	}

	// Decode through a pointer so that JSON null can be told apart from a
	// legitimate zero value of T: null leaves the pointer nil.
	var single *T
	if err := json.Unmarshal(b, &single); err == nil {
		if single == nil {
			// JSON null: keep the current value.
			return nil
		}
		*v = UnionValues[T]{*single}
		return nil
	}

	return fmt.Errorf("value must be a single value or an array of values: %s", string(b))
}

// MarshalJSON marshals a UnionValues into a JSON array.
//
// A nil UnionValues is encoded as JSON null, like any nil slice.
func (v UnionValues[T]) MarshalJSON() ([]byte, error) {
	return json.Marshal([]T(v))
}

opt_test.go

Lines changed: 81 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,81 @@
1+
// Copyright 2025 SIXT SE
2+
//
3+
// Licensed under the Apache License, Version 2.0 (the "License");
4+
// you may not use this file except in compliance with the License.
5+
// You may obtain a copy of the License at
6+
//
7+
// http://www.apache.org/licenses/LICENSE-2.0
8+
//
9+
// Unless required by applicable law or agreed to in writing, software
10+
// distributed under the License is distributed on an "AS IS" BASIS,
11+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
// See the License for the specific language governing permissions and
13+
// limitations under the License.
14+
15+
package tensorlake
16+
17+
import (
18+
"encoding/json"
19+
"reflect"
20+
"testing"
21+
)
22+
23+
func TestValueOrValuesUnmarshalJSON(t *testing.T) {
24+
tests := []struct {
25+
value string
26+
expected []int
27+
}{
28+
{
29+
value: "1",
30+
expected: []int{1},
31+
},
32+
{
33+
value: "[1, 2, 3]",
34+
expected: []int{1, 2, 3},
35+
},
36+
}
37+
38+
for _, test := range tests {
39+
var v UnionValues[int]
40+
if err := json.Unmarshal([]byte(test.value), &v); err != nil {
41+
t.Fatalf("failed to unmarshal: %v", err)
42+
}
43+
if !reflect.DeepEqual(v, UnionValues[int](test.expected)) {
44+
t.Fatalf("expected %v, got %v", test.expected, v)
45+
}
46+
}
47+
}
48+
49+
func TestValueOrValuesMarshalJSON(t *testing.T) {
50+
type testType struct {
51+
Value UnionValues[int] `json:"value"`
52+
}
53+
54+
tests := []struct {
55+
value string
56+
expected testType
57+
}{
58+
{
59+
value: `{"value": 1}`,
60+
expected: testType{
61+
Value: UnionValues[int]{1},
62+
},
63+
},
64+
{
65+
value: `{"value": [1, 2, 3]}`,
66+
expected: testType{
67+
Value: UnionValues[int]{1, 2, 3},
68+
},
69+
},
70+
}
71+
72+
for _, test := range tests {
73+
var v testType
74+
if err := json.Unmarshal([]byte(test.value), &v); err != nil {
75+
t.Fatalf("failed to unmarshal: %v", err)
76+
}
77+
if !reflect.DeepEqual(v, test.expected) {
78+
t.Fatalf("expected %+v, got %+v", test.expected, v)
79+
}
80+
}
81+
}

parse_get.go

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
// Copyright 2025 SIXT SE
2+
//
3+
// Licensed under the Apache License, Version 2.0 (the "License");
4+
// you may not use this file except in compliance with the License.
5+
// You may obtain a copy of the License at
6+
//
7+
// http://www.apache.org/licenses/LICENSE-2.0
8+
//
9+
// Unless required by applicable law or agreed to in writing, software
10+
// distributed under the License is distributed on an "AS IS" BASIS,
11+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
// See the License for the specific language governing permissions and
13+
// limitations under the License.
14+
15+
package tensorlake
16+
17+
import (
18+
"context"
19+
"encoding/json"
20+
"fmt"
21+
"io"
22+
"net/http"
23+
)
24+
25+
// GetParseResult retrieves the result of a parse job.
26+
// The response will include: 1) parsed content (markdown or pages);
27+
// 2) structured extraction results (if schemas are provided during the parse request);
28+
// 3) page classification results (if page classifications are provided during the parse request).
29+
//
30+
// When the job finishes successfully, the response will contain pages
31+
// (chunks of the page) chunks (text chunks extracted from the document),
32+
// structured data (every schema_name provided in the parse request as a key).
33+
func (c *Client) GetParseResult(ctx context.Context, parseId string) (*ParseResult, error) {
34+
reqURL := fmt.Sprintf("%s/parse/%s", c.baseURL, parseId)
35+
36+
req, err := http.NewRequestWithContext(ctx, http.MethodGet, reqURL, nil)
37+
if err != nil {
38+
return nil, fmt.Errorf("failed to create request: %w", err)
39+
}
40+
41+
return do(c, req, func(r io.Reader) (*ParseResult, error) {
42+
var result ParseResult
43+
if err := json.NewDecoder(r).Decode(&result); err != nil {
44+
return nil, fmt.Errorf("failed to decode response: %w", err)
45+
}
46+
return &result, nil
47+
})
48+
}

parse_parse.go

Lines changed: 116 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,116 @@
1+
// Copyright 2025 SIXT SE
2+
//
3+
// Licensed under the Apache License, Version 2.0 (the "License");
4+
// you may not use this file except in compliance with the License.
5+
// You may obtain a copy of the License at
6+
//
7+
// http://www.apache.org/licenses/LICENSE-2.0
8+
//
9+
// Unless required by applicable law or agreed to in writing, software
10+
// distributed under the License is distributed on an "AS IS" BASIS,
11+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
// See the License for the specific language governing permissions and
13+
// limitations under the License.
14+
15+
package tensorlake
16+
17+
import (
18+
"bytes"
19+
"context"
20+
"encoding/json"
21+
"fmt"
22+
"io"
23+
"log/slog"
24+
"net/http"
25+
)
26+
27+
type ParseDocumentRequest struct {
28+
FileSource
29+
30+
// ParsingOptions contains the properties of this object define
31+
// the configuration for the document parsing process.
32+
//
33+
// Tensorlake provides sane defaults that work well for most
34+
// documents, so this object is not required. However, every document
35+
// is different, and you may want to customize the parsing process to
36+
// better suit your needs.
37+
ParsingOptions *ParsingOptions `json:"parsing_options,omitempty"`
38+
39+
// The properties of this object help to extend the output of the document
40+
// parsing process with additional information.
41+
//
42+
// This includes summarization of tables and figures, which can help to
43+
// provide a more comprehensive understanding of the document.
44+
//
45+
// This object is not required, and the API will use default settings if it
46+
// is not present.
47+
EnrichmentOptions *EnrichmentOptions `json:"enrichment_options,omitempty"`
48+
49+
// StructuredExtractionOptions is the options for structured data extraction.
50+
//
51+
// The properties of this object define the configuration for structured
52+
// data extraction.
53+
//
54+
// If this object is present, the API will perform structured data
55+
// extraction on the document.
56+
StructuredExtractionOptions []StructuredExtractionOptions `json:"structured_extraction_options,omitempty"`
57+
58+
// PageClassificationOptions is the options for page classification.
59+
//
60+
// The properties of this object define the configuration for page
61+
// classify.
62+
//
63+
// If this object is present, the API will perform page classify on
64+
// the document.
65+
PageClassificationOptions []PageClassConfig `json:"page_classifications,omitempty"`
66+
67+
// PageRange is a comma-separated list of page numbers or
68+
// ranges to parse (e.g., '1,2,3-5'). Default: all pages.
69+
// Examples: "1-5,8,10"
70+
PageRange string `json:"page_range,omitempty"`
71+
72+
// Additional metadata to identify the read request. The labels are
73+
// returned in the read response.
74+
Labels map[string]string `json:"labels,omitempty"`
75+
76+
// MimeType is the MIME type of the file. This is used to determine how to process the file.
77+
MimeType MimeType `json:"mime_type,omitempty"`
78+
}
79+
80+
// ParseDocumentResponse is the value returned by the ParseDocument
// operation. It carries the identifier of the newly created parse job and
// the time at which the job was created.
type ParseDocumentResponse struct {
	// ParseId uniquely identifies the parse job.
	//
	// Use it to track the job: it is the {parse_id} path parameter of the
	// GET /documents/v2/parse/{parse_id} endpoint, which returns the status
	// and results of the parse job.
	ParseId string `json:"parse_id"`

	// CreatedAt is the creation date and time of the parse job, kept as
	// the string received in the response.
	CreatedAt string `json:"created_at"`
}
93+
94+
// ParseDocument submits a document for comprehensive parsing (read, extract, and classify).
95+
func (c *Client) ParseDocument(ctx context.Context, in *ParseDocumentRequest) (*ParseDocumentResponse, error) {
96+
if !in.SourceProvided() {
97+
return nil, fmt.Errorf("exactly one of file_id, file_url, or raw_text must be provided")
98+
}
99+
100+
body, _ := json.Marshal(in) // Impossible to fail?
101+
102+
slog.Info("ParseDocument request", "request", string(body))
103+
104+
req, err := http.NewRequestWithContext(ctx, http.MethodPost, c.baseURL+"/parse", bytes.NewReader(body))
105+
if err != nil {
106+
return nil, fmt.Errorf("failed to create request: %w", err)
107+
}
108+
109+
return do(c, req, func(r io.Reader) (*ParseDocumentResponse, error) {
110+
var result ParseDocumentResponse
111+
if err := json.NewDecoder(r).Decode(&result); err != nil {
112+
return nil, fmt.Errorf("failed to decode response: %w", err)
113+
}
114+
return &result, nil
115+
})
116+
}

0 commit comments

Comments
 (0)