Skip to content

Commit 92e548b

Browse files
committed
feat: web tech api
1 parent 417b7fc commit 92e548b

File tree

12 files changed

+854
-4
lines changed

12 files changed

+854
-4
lines changed

api/pom.xml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
<parent>
77
<groupId>top.bella</groupId>
88
<artifactId>openai-java</artifactId>
9-
<version>0.23.81</version>
9+
<version>0.23.82</version>
1010
</parent>
1111
<packaging>jar</packaging>
1212
<artifactId>openai-api</artifactId>
Lines changed: 150 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,150 @@
1+
package com.theokanning.openai.web;
2+
3+
import com.fasterxml.jackson.annotation.JsonProperty;
4+
import com.fasterxml.jackson.annotation.JsonValue;
5+
import com.theokanning.openai.assistants.IUssrRequest;
6+
import lombok.AllArgsConstructor;
7+
import lombok.Data;
8+
import lombok.NoArgsConstructor;
9+
import lombok.experimental.SuperBuilder;
10+
11+
import javax.validation.constraints.Min;
12+
import javax.validation.constraints.NotBlank;
13+
import java.io.Serializable;
14+
import java.util.List;
15+
16+
/**
17+
* Web Crawl Request based on Tavily Crawl API Provides comprehensive web crawling functionality with configurable depth and filtering options
18+
*/
19+
@Data
20+
@SuperBuilder
21+
@NoArgsConstructor
22+
public class WebCrawlRequest implements IUssrRequest, Serializable {
23+
private static final long serialVersionUID = 1L;
24+
25+
/**
26+
* The root URL to begin the crawl (required) Example: "docs.tavily.com"
27+
*/
28+
@NotBlank(message = "URL cannot be blank")
29+
private String url;
30+
31+
/**
32+
* Model to use for the crawl request
33+
*/
34+
private String model;
35+
36+
/**
37+
* A unique identifier representing your end-user
38+
*/
39+
private String user;
40+
41+
/**
42+
* Natural language instructions for the crawler When specified, the mapping cost increases to 2 API credits per 10 successful pages instead of 1
43+
* API credit per 10 pages Example: "Find all pages about the Python SDK"
44+
*/
45+
private String instructions;
46+
47+
/**
48+
* Max depth of the crawl. Defines how far from the base URL the crawler can explore Default: 1
49+
*/
50+
@JsonProperty("max_depth")
51+
private Integer maxDepth = 1;
52+
53+
/**
54+
* Max number of links to follow per level of the tree (i.e., per page) Default: 20
55+
*/
56+
@JsonProperty("max_breadth")
57+
private Integer maxBreadth = 20;
58+
59+
/**
60+
* Total number of links the crawler will process before stopping Default: 50
61+
*/
62+
private Integer limit = 50;
63+
64+
/**
65+
* Regex patterns to select only URLs with specific path patterns Example: ["/docs/.*", "/api/v1.*"]
66+
*/
67+
@JsonProperty("select_paths")
68+
private List<String> selectPaths;
69+
70+
/**
71+
* Regex patterns to select crawling to specific domains or subdomains Example: ["^docs\\.example\\.com$"]
72+
*/
73+
@JsonProperty("select_domains")
74+
private List<String> selectDomains;
75+
76+
/**
77+
* Regex patterns to exclude URLs with specific path patterns Example: ["/private/.*", "/admin/.*"]
78+
*/
79+
@JsonProperty("exclude_paths")
80+
private List<String> excludePaths;
81+
82+
/**
83+
* Regex patterns to exclude specific domains or subdomains from crawling Example: ["^private\\.example\\.com$"]
84+
*/
85+
@JsonProperty("exclude_domains")
86+
private List<String> excludeDomains;
87+
88+
/**
89+
* Whether to include external domain links in the final results list Default: true
90+
*/
91+
@JsonProperty("allow_external")
92+
private Boolean allowExternal = true;
93+
94+
/**
95+
* Whether to include images in the crawl results Default: false
96+
*/
97+
@JsonProperty("include_images")
98+
private Boolean includeImages = false;
99+
100+
/**
101+
* Advanced extraction retrieves more data, including tables and embedded content, with higher success but may increase latency - basic: costs 1
102+
* credit per 5 successful extractions - advanced: costs 2 credits per 5 successful extractions Default: basic
103+
*/
104+
@JsonProperty("extract_depth")
105+
private ExtractDepth extractDepth = ExtractDepth.BASIC;
106+
107+
/**
108+
* The format of the extracted web page content - markdown: returns content in markdown format - text: returns plain text and may increase latency
109+
* Default: markdown
110+
*/
111+
private Format format = Format.MARKDOWN;
112+
113+
/**
114+
* Whether to include the favicon URL for each result Default: false
115+
*/
116+
@JsonProperty("include_favicon")
117+
private Boolean includeFavicon = false;
118+
119+
/**
120+
* Extract depth enum for extraction complexity
121+
*/
122+
@AllArgsConstructor
123+
public enum ExtractDepth {
124+
BASIC("basic"),
125+
ADVANCED("advanced");
126+
127+
private final String value;
128+
129+
@JsonValue
130+
public String getValue() {
131+
return value;
132+
}
133+
}
134+
135+
/**
136+
* Format enum for content extraction format
137+
*/
138+
@AllArgsConstructor
139+
public enum Format {
140+
MARKDOWN("markdown"),
141+
TEXT("text");
142+
143+
private final String value;
144+
145+
@JsonValue
146+
public String getValue() {
147+
return value;
148+
}
149+
}
150+
}
Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,71 @@
1+
package com.theokanning.openai.web;
2+
3+
import com.fasterxml.jackson.annotation.JsonProperty;
4+
import lombok.Data;
5+
import lombok.NoArgsConstructor;
6+
import lombok.experimental.SuperBuilder;
7+
8+
import java.io.Serializable;
9+
import java.util.List;
10+
11+
/**
12+
* Web Crawl Response based on Tavily Crawl API Contains crawled content, metadata, and timing information from web crawling
13+
*/
14+
@Data
15+
@SuperBuilder
16+
@NoArgsConstructor
17+
public class WebCrawlResponse implements Serializable {
18+
private static final long serialVersionUID = 1L;
19+
20+
/**
21+
* The base URL that was crawled Example: "docs.tavily.com"
22+
*/
23+
@JsonProperty("base_url")
24+
private String baseUrl;
25+
26+
/**
27+
* A list of extracted content from the crawled URLs
28+
*/
29+
private List<CrawlResult> results;
30+
31+
/**
32+
* Time in seconds it took to complete the request Example: 1.23
33+
*/
34+
@JsonProperty("response_time")
35+
private Double responseTime;
36+
37+
/**
38+
* A unique request identifier you can share with customer support to help resolve issues with specific requests Example:
39+
* "123e4567-e89b-12d3-a456-426614174111"
40+
*/
41+
@JsonProperty("request_id")
42+
private String requestId;
43+
44+
45+
/**
46+
* Crawl Result class representing individual crawled page results
47+
*/
48+
@Data
49+
@SuperBuilder
50+
@NoArgsConstructor
51+
public static class CrawlResult implements Serializable {
52+
private static final long serialVersionUID = 1L;
53+
54+
/**
55+
* The URL that was crawled Example: "https://docs.tavily.com"
56+
*/
57+
private String url;
58+
59+
/**
60+
* The full content extracted from the page This contains the complete extracted content in the specified format (markdown or text)
61+
*/
62+
@JsonProperty("raw_content")
63+
private String rawContent;
64+
65+
/**
66+
* The favicon URL for the result Only present if include_favicon was set to true in the request Example:
67+
* "https://mintlify.s3-us-west-1.amazonaws.com/tavilyai/_generated/favicon/apple-touch-icon.png?v=3"
68+
*/
69+
private String favicon;
70+
}
71+
}
Lines changed: 113 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,113 @@
1+
package com.theokanning.openai.web;
2+
3+
import com.fasterxml.jackson.annotation.JsonProperty;
4+
import com.fasterxml.jackson.annotation.JsonValue;
5+
import com.theokanning.openai.assistants.IUssrRequest;
6+
import lombok.AllArgsConstructor;
7+
import lombok.Data;
8+
import lombok.NoArgsConstructor;
9+
import lombok.experimental.SuperBuilder;
10+
11+
import javax.validation.constraints.NotEmpty;
12+
import java.io.Serializable;
13+
import java.util.List;
14+
15+
/**
16+
* Web Extract Request based on Tavily Extract API
17+
* Extract web page content from one or more specified URLs using Tavily Extract
18+
*/
19+
@Data
20+
@SuperBuilder
21+
@NoArgsConstructor
22+
public class WebExtractRequest implements IUssrRequest, Serializable {
23+
private static final long serialVersionUID = 1L;
24+
25+
/**
26+
* The URLs to extract content from (required)
27+
* Example: ["https://en.wikipedia.org/wiki/Artificial_intelligence"]
28+
*/
29+
@NotEmpty(message = "URLs cannot be empty")
30+
private List<String> urls;
31+
32+
/**
33+
* Model to use for the extract request
34+
*/
35+
private String model;
36+
37+
/**
38+
* A unique identifier representing your end-user
39+
*/
40+
private String user;
41+
42+
/**
43+
* Include a list of images extracted from the URLs in the response
44+
* Default: false
45+
*/
46+
@JsonProperty("include_images")
47+
private Boolean includeImages = false;
48+
49+
/**
50+
* Whether to include the favicon URL for each result
51+
* Default: false
52+
*/
53+
@JsonProperty("include_favicon")
54+
private Boolean includeFavicon = false;
55+
56+
/**
57+
* The depth of the extraction process
58+
* - basic: costs 1 credit per 5 successful URL extractions
59+
* - advanced: costs 2 credits per 5 successful URL extractions
60+
* Default: basic
61+
*/
62+
@JsonProperty("extract_depth")
63+
private ExtractDepth extractDepth = ExtractDepth.BASIC;
64+
65+
/**
66+
* The format of the extracted web page content
67+
* - markdown: returns content in markdown format
68+
* - text: returns plain text and may increase latency
69+
* Default: markdown
70+
*/
71+
private Format format = Format.MARKDOWN;
72+
73+
/**
74+
* Maximum time in seconds to wait for the URL extraction before timing out
75+
* Must be between 1.0 and 60.0 seconds
76+
* If not specified, default timeouts are applied based on extract_depth:
77+
* - 10 seconds for basic extraction
78+
* - 30 seconds for advanced extraction
79+
*/
80+
private Double timeout;
81+
82+
/**
83+
* Extract depth enum for extraction complexity
84+
*/
85+
@AllArgsConstructor
86+
public enum ExtractDepth {
87+
BASIC("basic"),
88+
ADVANCED("advanced");
89+
90+
private final String value;
91+
92+
@JsonValue
93+
public String getValue() {
94+
return value;
95+
}
96+
}
97+
98+
/**
99+
* Format enum for content extraction format
100+
*/
101+
@AllArgsConstructor
102+
public enum Format {
103+
MARKDOWN("markdown"),
104+
TEXT("text");
105+
106+
private final String value;
107+
108+
@JsonValue
109+
public String getValue() {
110+
return value;
111+
}
112+
}
113+
}

0 commit comments

Comments
 (0)