Skip to content

Commit c78c7d5

Browse files
jjoyce0510John JoycecursoragentJohn JoyceJohn Joyce
authored
feat(): Support context document import from local filesystem and GitHub, Notion, Confluence (#16903)
Co-authored-by: John Joyce <john@ip-192-168-1-212.us-west-2.compute.internal> Co-authored-by: Cursor <cursoragent@cursor.com> Co-authored-by: John Joyce <john@Mac-5837.lan> Co-authored-by: John Joyce <john@Mac-5917.lan> Co-authored-by: John Joyce <john@Mac-6389.lan>
1 parent a0147e7 commit c78c7d5

115 files changed

Lines changed: 6576 additions & 108 deletions

File tree

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/GmsGraphQLEngine.java

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -445,6 +445,7 @@ public class GmsGraphQLEngine {
445445
private ConnectionService connectionService;
446446
private AssertionService assertionService;
447447
private final DocumentService documentService;
448+
private final com.linkedin.metadata.service.docimport.DocumentImportService documentImportService;
448449
private final EntityVersioningService entityVersioningService;
449450
private final ApplicationService applicationService;
450451
private final PageTemplateService pageTemplateService;
@@ -595,6 +596,7 @@ public GmsGraphQLEngine(final GmsGraphQLEngineArgs args) {
595596
this.connectionService = args.connectionService;
596597
this.assertionService = args.assertionService;
597598
this.documentService = args.documentService;
599+
this.documentImportService = args.documentImportService;
598600
this.entityVersioningService = args.entityVersioningService;
599601

600602
this.businessAttributeService = args.businessAttributeService;
@@ -3140,7 +3142,8 @@ private void configureDocumentResolvers(final RuntimeWiring.Builder builder) {
31403142
this.entityService,
31413143
this.graphClient,
31423144
entityRegistry,
3143-
this.timelineService)
3145+
this.timelineService,
3146+
this.documentImportService)
31443147
.configureResolvers(builder);
31453148
}
31463149

datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/GmsGraphQLEngineArgs.java

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,7 @@
4040
import com.linkedin.metadata.service.QueryService;
4141
import com.linkedin.metadata.service.SettingsService;
4242
import com.linkedin.metadata.service.ViewService;
43+
import com.linkedin.metadata.service.docimport.DocumentImportService;
4344
import com.linkedin.metadata.timeline.TimelineService;
4445
import com.linkedin.metadata.timeseries.TimeseriesAspectService;
4546
import com.linkedin.metadata.utils.aws.S3Util;
@@ -101,6 +102,7 @@ public class GmsGraphQLEngineArgs {
101102
ConnectionService connectionService;
102103
AssertionService assertionService;
103104
DocumentService documentService;
105+
DocumentImportService documentImportService;
104106
EntityVersioningService entityVersioningService;
105107
ApplicationService applicationService;
106108
PageTemplateService pageTemplateService;

datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/knowledge/DocumentResolvers.java

Lines changed: 47 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -11,9 +11,11 @@
1111
import com.linkedin.metadata.entity.EntityService;
1212
import com.linkedin.metadata.models.registry.EntityRegistry;
1313
import com.linkedin.metadata.service.DocumentService;
14+
import com.linkedin.metadata.service.docimport.DocumentImportService;
1415
import com.linkedin.metadata.timeline.TimelineService;
1516
import graphql.schema.idl.RuntimeWiring;
1617
import javax.annotation.Nonnull;
18+
import javax.annotation.Nullable;
1719

1820
/** Configures resolvers for Document query, mutation, and type wiring. */
1921
public class DocumentResolvers {
@@ -31,6 +33,7 @@ public class DocumentResolvers {
3133
private final com.linkedin.metadata.graph.GraphClient graphClient;
3234
private final EntityRegistry entityRegistry;
3335
private final TimelineService timelineService;
36+
@Nullable private final DocumentImportService documentImportService;
3437

3538
public DocumentResolvers(
3639
@Nonnull DocumentService documentService,
@@ -42,7 +45,8 @@ public DocumentResolvers(
4245
@Nonnull EntityService entityService,
4346
@Nonnull com.linkedin.metadata.graph.GraphClient graphClient,
4447
@Nonnull EntityRegistry entityRegistry,
45-
@Nonnull TimelineService timelineService) {
48+
@Nonnull TimelineService timelineService,
49+
@Nullable DocumentImportService documentImportService) {
4650
this.documentService = documentService;
4751
this.entityTypes = entityTypes;
4852
this.documentType = documentType;
@@ -53,6 +57,7 @@ public DocumentResolvers(
5357
this.graphClient = graphClient;
5458
this.entityRegistry = entityRegistry;
5559
this.timelineService = timelineService;
60+
this.documentImportService = documentImportService;
5661
}
5762

5863
public void configureResolvers(final RuntimeWiring.Builder builder) {
@@ -73,40 +78,47 @@ public void configureResolvers(final RuntimeWiring.Builder builder) {
7378
// Mutation resolvers
7479
builder.type(
7580
MUTATION_TYPE,
76-
typeWiring ->
77-
typeWiring
78-
.dataFetcher(
79-
"createDocument",
80-
new com.linkedin.datahub.graphql.resolvers.knowledge.CreateDocumentResolver(
81-
documentService, entityService))
82-
.dataFetcher(
83-
"updateDocumentContents",
84-
new com.linkedin.datahub.graphql.resolvers.knowledge
85-
.UpdateDocumentContentsResolver(documentService))
86-
.dataFetcher(
87-
"updateDocumentRelatedEntities",
88-
new com.linkedin.datahub.graphql.resolvers.knowledge
89-
.UpdateDocumentRelatedEntitiesResolver(documentService))
90-
.dataFetcher(
91-
"moveDocument",
92-
new com.linkedin.datahub.graphql.resolvers.knowledge.MoveDocumentResolver(
93-
documentService))
94-
.dataFetcher(
95-
"deleteDocument",
96-
new com.linkedin.datahub.graphql.resolvers.knowledge.DeleteDocumentResolver(
97-
documentService))
98-
.dataFetcher(
99-
"updateDocumentStatus",
100-
new com.linkedin.datahub.graphql.resolvers.knowledge
101-
.UpdateDocumentStatusResolver(documentService))
102-
.dataFetcher(
103-
"updateDocumentSubType",
104-
new com.linkedin.datahub.graphql.resolvers.knowledge
105-
.UpdateDocumentSubTypeResolver(documentService))
106-
.dataFetcher(
107-
"updateDocumentSettings",
108-
new com.linkedin.datahub.graphql.resolvers.knowledge
109-
.UpdateDocumentSettingsResolver(documentService)));
81+
typeWiring -> {
82+
typeWiring
83+
.dataFetcher(
84+
"createDocument",
85+
new com.linkedin.datahub.graphql.resolvers.knowledge.CreateDocumentResolver(
86+
documentService, entityService))
87+
.dataFetcher(
88+
"updateDocumentContents",
89+
new com.linkedin.datahub.graphql.resolvers.knowledge
90+
.UpdateDocumentContentsResolver(documentService))
91+
.dataFetcher(
92+
"updateDocumentRelatedEntities",
93+
new com.linkedin.datahub.graphql.resolvers.knowledge
94+
.UpdateDocumentRelatedEntitiesResolver(documentService))
95+
.dataFetcher(
96+
"moveDocument",
97+
new com.linkedin.datahub.graphql.resolvers.knowledge.MoveDocumentResolver(
98+
documentService))
99+
.dataFetcher(
100+
"deleteDocument",
101+
new com.linkedin.datahub.graphql.resolvers.knowledge.DeleteDocumentResolver(
102+
documentService))
103+
.dataFetcher(
104+
"updateDocumentStatus",
105+
new com.linkedin.datahub.graphql.resolvers.knowledge.UpdateDocumentStatusResolver(
106+
documentService))
107+
.dataFetcher(
108+
"updateDocumentSubType",
109+
new com.linkedin.datahub.graphql.resolvers.knowledge
110+
.UpdateDocumentSubTypeResolver(documentService))
111+
.dataFetcher(
112+
"updateDocumentSettings",
113+
new com.linkedin.datahub.graphql.resolvers.knowledge
114+
.UpdateDocumentSettingsResolver(documentService));
115+
if (documentImportService != null) {
116+
typeWiring.dataFetcher(
117+
"importDocumentsFromFiles",
118+
new ImportDocumentsFromFilesResolver(documentImportService));
119+
}
120+
return typeWiring;
121+
});
110122

111123
// Type wiring for Document root
112124
builder.type(
Lines changed: 116 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,116 @@
1+
package com.linkedin.datahub.graphql.resolvers.knowledge;
2+
3+
import static com.linkedin.datahub.graphql.resolvers.ResolverUtils.bindArgument;
4+
5+
import com.linkedin.common.urn.Urn;
6+
import com.linkedin.datahub.graphql.QueryContext;
7+
import com.linkedin.datahub.graphql.authorization.AuthorizationUtils;
8+
import com.linkedin.datahub.graphql.concurrency.GraphQLConcurrencyUtils;
9+
import com.linkedin.datahub.graphql.exception.AuthorizationException;
10+
import com.linkedin.datahub.graphql.generated.DocumentFileInput;
11+
import com.linkedin.datahub.graphql.generated.ImportDocumentsFromFilesInput;
12+
import com.linkedin.datahub.graphql.generated.ImportDocumentsResult;
13+
import com.linkedin.metadata.service.docimport.DocumentCandidate;
14+
import com.linkedin.metadata.service.docimport.DocumentImportService;
15+
import com.linkedin.metadata.service.docimport.ImportResult;
16+
import com.linkedin.metadata.service.docimport.ImportUseCase;
17+
import com.linkedin.metadata.service.docimport.TextExtractors;
18+
import graphql.schema.DataFetcher;
19+
import graphql.schema.DataFetchingEnvironment;
20+
import java.util.ArrayList;
21+
import java.util.HashMap;
22+
import java.util.List;
23+
import java.util.Map;
24+
import java.util.Objects;
25+
import java.util.concurrent.CompletableFuture;
26+
import javax.annotation.Nonnull;
27+
import lombok.extern.slf4j.Slf4j;
28+
29+
/**
30+
* Resolver for importing documents from pre-parsed file contents. The frontend is responsible for
31+
* text extraction (e.g. FileReader for text files, mammoth for DOCX). Requires MANAGE_DOCUMENTS
32+
* privilege.
33+
*/
34+
@Slf4j
35+
public class ImportDocumentsFromFilesResolver
36+
implements DataFetcher<CompletableFuture<ImportDocumentsResult>> {
37+
38+
private final DocumentImportService _importService;
39+
40+
public ImportDocumentsFromFilesResolver(@Nonnull final DocumentImportService importService) {
41+
this._importService = Objects.requireNonNull(importService, "importService must not be null");
42+
}
43+
44+
@Override
45+
public CompletableFuture<ImportDocumentsResult> get(DataFetchingEnvironment environment)
46+
throws Exception {
47+
final QueryContext context = environment.getContext();
48+
final ImportDocumentsFromFilesInput input =
49+
bindArgument(environment.getArgument("input"), ImportDocumentsFromFilesInput.class);
50+
51+
return GraphQLConcurrencyUtils.supplyAsync(
52+
() -> {
53+
if (!AuthorizationUtils.canManageDocuments(context)) {
54+
throw new AuthorizationException(
55+
"Unauthorized to import documents. Requires MANAGE_DOCUMENTS privilege.");
56+
}
57+
58+
try {
59+
boolean showInGlobal =
60+
input.getShowInGlobalContext() != null ? input.getShowInGlobalContext() : true;
61+
ImportUseCase useCase =
62+
input.getUseCase() != null
63+
? ImportUseCase.fromString(input.getUseCase().toString())
64+
: ImportUseCase.CONTEXT_DOCUMENT;
65+
66+
Urn parentUrn =
67+
input.getParentDocumentUrn() != null
68+
? Urn.createFromString(input.getParentDocumentUrn())
69+
: null;
70+
71+
List<DocumentCandidate> candidates = new ArrayList<>();
72+
for (DocumentFileInput doc : input.getDocuments()) {
73+
String filename = doc.getFileName();
74+
String text = doc.getContent();
75+
if (text == null || text.isBlank()) {
76+
log.warn("Skipping empty document: {}", filename);
77+
continue;
78+
}
79+
80+
String ext = TextExtractors.getExtension(filename);
81+
Map<String, String> props = new HashMap<>();
82+
props.put("import_source", "file_upload");
83+
props.put("original_filename", filename);
84+
props.put("file_extension", ext);
85+
86+
candidates.add(
87+
DocumentCandidate.builder()
88+
.title(TextExtractors.titleFromFilename(filename))
89+
.text(text)
90+
.sourceId(DocumentImportService.makeFileSourceId(filename))
91+
.customProperties(props)
92+
.build());
93+
}
94+
95+
ImportResult result =
96+
_importService.importDocuments(
97+
context.getOperationContext(),
98+
candidates,
99+
useCase,
100+
showInGlobal,
101+
parentUrn,
102+
Urn.createFromString(context.getActorUrn()));
103+
104+
return ImportDocumentsResultMapper.toGraphQL(result);
105+
} catch (AuthorizationException e) {
106+
throw e;
107+
} catch (Exception e) {
108+
log.error("Failed to import documents from files: {}", e.getMessage());
109+
throw new RuntimeException(
110+
String.format("Failed to import documents from files: %s", e.getMessage()), e);
111+
}
112+
},
113+
this.getClass().getSimpleName(),
114+
"get");
115+
}
116+
}
Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
package com.linkedin.datahub.graphql.resolvers.knowledge;
2+
3+
import com.linkedin.datahub.graphql.generated.ImportDocumentsResult;
4+
import com.linkedin.metadata.service.docimport.ImportResult;
5+
import java.util.ArrayList;
6+
import javax.annotation.Nonnull;
7+
8+
/** Maps service-layer document import results to GraphQL types. */
9+
public final class ImportDocumentsResultMapper {
10+
11+
private ImportDocumentsResultMapper() {}
12+
13+
@Nonnull
14+
public static ImportDocumentsResult toGraphQL(@Nonnull ImportResult result) {
15+
ImportDocumentsResult graphqlResult = new ImportDocumentsResult();
16+
graphqlResult.setCreatedCount(result.getCreatedCount());
17+
graphqlResult.setUpdatedCount(result.getUpdatedCount());
18+
graphqlResult.setFailedCount(result.getFailedCount());
19+
graphqlResult.setErrors(new ArrayList<>(result.getErrors()));
20+
graphqlResult.setDocumentUrns(new ArrayList<>(result.getDocumentUrns()));
21+
return graphqlResult;
22+
}
23+
}

0 commit comments

Comments
 (0)