Skip to content

Commit 00f77e6

Browse files
authored
[CTM-248] Defer/skip adding auth domains for (certain) PFBs (#1075)
1 parent 44031c2 commit 00f77e6

16 files changed

Lines changed: 328 additions & 18 deletions

File tree

service/src/main/java/org/databiosphere/workspacedataservice/config/DataImportProperties.java

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -153,7 +153,8 @@ public record ImportSourceConfig(
153153
List<Pattern> urls,
154154
boolean requirePrivateWorkspace,
155155
boolean requireProtectedDataPolicy,
156-
List<String> requiredAuthDomainGroups) {
156+
List<String> requiredAuthDomainGroups,
157+
boolean alwaysApplyAuthDomains) {
157158
public boolean matchesUri(URI uri) {
158159
String uriString = uri.toString();
159160
return urls.stream().anyMatch(urlPattern -> urlPattern.matcher(uriString).find());

service/src/main/java/org/databiosphere/workspacedataservice/dataimport/DefaultImportValidator.java

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -152,7 +152,9 @@ private void validateDestinationWorkspace(
152152
"Data from this source cannot be imported into a public workspace.");
153153
}
154154

155-
if (!requirements.requiredAuthDomainGroups().isEmpty()) {
155+
// Add required auth domain groups now only if the source always applies them; otherwise defer
156+
if (requirements.alwaysApplyAuthDomains()
157+
&& !requirements.requiredAuthDomainGroups().isEmpty()) {
156158
protectedDataSupport.addAuthDomainGroupsToWorkspace(
157159
destinationWorkspaceId, requirements.requiredAuthDomainGroups());
158160
}

service/src/main/java/org/databiosphere/workspacedataservice/dataimport/ImportRequirements.java

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,8 +3,11 @@
33
import java.util.List;
44

55
public record ImportRequirements(
6-
boolean privateWorkspace, boolean protectedDataPolicy, List<String> requiredAuthDomainGroups) {
6+
boolean privateWorkspace,
7+
boolean protectedDataPolicy,
8+
List<String> requiredAuthDomainGroups,
9+
boolean alwaysApplyAuthDomains) {
710
public ImportRequirements() {
8-
this(false, false, List.of());
11+
this(false, false, List.of(), true);
912
}
1013
}

service/src/main/java/org/databiosphere/workspacedataservice/dataimport/ImportRequirementsFactory.java

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,16 @@ public ImportRequirements getRequirementsForImport(URI importUri) {
3232
.flatMap(source -> source.requiredAuthDomainGroups().stream())
3333
.toList();
3434

35+
boolean alwaysApplyAuthDomains =
36+
sources != null
37+
&& sources.stream()
38+
.filter(source -> source.matchesUri(importUri))
39+
.anyMatch(ImportSourceConfig::alwaysApplyAuthDomains);
40+
3541
return new ImportRequirements(
36-
requiresPrivateWorkspace, requiresProtectedDataPolicy, requiredAuthDomainGroups);
42+
requiresPrivateWorkspace,
43+
requiresProtectedDataPolicy,
44+
requiredAuthDomainGroups,
45+
alwaysApplyAuthDomains);
3746
}
3847
}

service/src/main/java/org/databiosphere/workspacedataservice/dataimport/pfb/PfbQuartzJob.java

Lines changed: 81 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
import io.micrometer.observation.Observation;
1212
import io.micrometer.observation.ObservationRegistry;
1313
import java.net.URI;
14+
import java.util.List;
1415
import java.util.Objects;
1516
import java.util.Set;
1617
import java.util.Spliterator;
@@ -26,6 +27,9 @@
2627
import org.databiosphere.workspacedataservice.dao.JobDao;
2728
import org.databiosphere.workspacedataservice.dataimport.ImportDetails;
2829
import org.databiosphere.workspacedataservice.dataimport.ImportDetailsRetriever;
30+
import org.databiosphere.workspacedataservice.dataimport.ImportRequirements;
31+
import org.databiosphere.workspacedataservice.dataimport.ImportRequirementsFactory;
32+
import org.databiosphere.workspacedataservice.dataimport.protecteddatasupport.ProtectedDataSupport;
2933
import org.databiosphere.workspacedataservice.dataimport.snapshotsupport.MultiCloudSnapshotSupportFactory;
3034
import org.databiosphere.workspacedataservice.dataimport.snapshotsupport.SnapshotSupport;
3135
import org.databiosphere.workspacedataservice.jobexec.JobDataMapReader;
@@ -66,6 +70,8 @@ public class PfbQuartzJob extends QuartzJob {
6670
private final ImportDetailsRetriever importDetailsRetriever;
6771
private final ImportMetrics importMetrics;
6872
private final DrsService drsService;
73+
private final ProtectedDataSupport protectedDataSupport;
74+
private final ImportRequirementsFactory importRequirementsFactory;
6975

7076
public PfbQuartzJob(
7177
JobDao jobDao,
@@ -78,7 +84,8 @@ public PfbQuartzJob(
7884
MultiCloudSnapshotSupportFactory snapshotSupportFactory,
7985
DataImportProperties dataImportProperties,
8086
ImportDetailsRetriever importDetailsRetriever,
81-
DrsService drsService) {
87+
DrsService drsService,
88+
ProtectedDataSupport protectedDataSupport) {
8289
super(jobDao, observationRegistry, dataImportProperties);
8390
this.recordSourceFactory = recordSourceFactory;
8491
this.recordSinkFactory = recordSinkFactory;
@@ -88,6 +95,9 @@ public PfbQuartzJob(
8895
this.importDetailsRetriever = importDetailsRetriever;
8996
this.importMetrics = importMetrics;
9097
this.drsService = drsService;
98+
this.protectedDataSupport = protectedDataSupport;
99+
this.importRequirementsFactory =
100+
new ImportRequirementsFactory(dataImportProperties.getSources());
91101
}
92102

93103
@Override
@@ -109,15 +119,36 @@ protected void executeInternal(UUID jobId, JobExecutionContext context) {
109119

110120
ImportDetails details = importDetailsRetriever.fetch(jobId, jobData, PrefixStrategy.PFB);
111121

122+
// Early PFB content inspection to determine if auth domains should be applied
123+
// This is HTTP connection #0 to the PFB; the NRES check below reads the PFB again if no snapshots are found.
124+
logger.info("Inspecting PFB content for auth domain requirements...");
125+
Set<UUID> snapshotIds = withPfbStream(uri, this::findSnapshots);
126+
logger.info("Found {} unique snapshot IDs in PFB", snapshotIds.size());
127+
boolean hasNresConsent = false;
128+
if (snapshotIds.isEmpty()) {
129+
hasNresConsent = withPfbStream(uri, this::hasNresConsentGroup);
130+
logger.info("NRES consent group present in PFB: {}", hasNresConsent);
131+
}
132+
ImportRequirements requirements = importRequirementsFactory.getRequirementsForImport(uri);
133+
134+
// If we skipped auth domains earlier, but now there is neither an NRES consent group nor any
135+
// snapshots,
136+
// apply the configured auth domains now.
137+
if (!requirements.requiredAuthDomainGroups().isEmpty()
138+
&& !requirements.alwaysApplyAuthDomains()
139+
&& snapshotIds.isEmpty()
140+
&& !hasNresConsent) {
141+
logger.info("Applying auth domain groups based on PFB content analysis...");
142+
protectedDataSupport.addAuthDomainGroupsToWorkspace(
143+
details.workspaceId(), requirements.requiredAuthDomainGroups());
144+
}
145+
112146
// Using the snapshot ids found above, create or verify references from the
113147
// workspace to the snapshot for each of those snapshot ids.
114148
// This will throw an exception if there are policy conflicts between the workspace
115149
// and the snapshots.
116150
//
117-
// This is HTTP connection #1 to the PFB.
118-
logger.info("Finding snapshots in this PFB...");
119-
Set<UUID> snapshotIds = withPfbStream(uri, this::findSnapshots);
120-
151+
// No new HTTP connection to the PFB is made here; snapshotIds is reused from the inspection above.
121152
logger.info("Linking snapshots...");
122153
linkSnapshots(snapshotIds, details.workspaceId());
123154

@@ -261,4 +292,49 @@ private UUID maybeUuid(String input) {
261292
return null;
262293
}
263294
}
295+
296+
/** Check whether the PFB has at least one anvil_dataset record with a non-empty consent_group and every such consent_group is NRES. */
297+
boolean hasNresConsentGroup(DataFileStream<GenericRecord> dataStream) {
298+
Stream<GenericRecord> recordStream =
299+
StreamSupport.stream(
300+
Spliterators.spliteratorUnknownSize(dataStream.iterator(), Spliterator.ORDERED), false);
301+
302+
// Collect every non-null, non-empty consent group from anvil_dataset records
303+
List<Object> consentGroups =
304+
recordStream
305+
.filter(rec -> "anvil_dataset".equals(rec.get("name").toString()))
306+
.map(rec -> rec.get("object"))
307+
.filter(GenericRecord.class::isInstance)
308+
.map(GenericRecord.class::cast)
309+
.filter(anvilDataset -> anvilDataset.hasField("consent_group"))
310+
.map(anvilDataset -> anvilDataset.get("consent_group"))
311+
.filter(Objects::nonNull)
312+
.filter(
313+
consentGroup -> {
314+
if (consentGroup instanceof java.util.Collection) {
315+
return !((java.util.Collection<?>) consentGroup).isEmpty();
316+
} else {
317+
return !consentGroup.toString().isEmpty();
318+
}
319+
})
320+
.toList();
321+
322+
// Must have at least one consent group AND all must be NRES
323+
if (consentGroups.isEmpty()) {
324+
return false;
325+
}
326+
327+
return consentGroups.stream()
328+
.allMatch(
329+
consentGroup -> {
330+
if (consentGroup instanceof java.util.Collection) {
331+
return ((java.util.Collection<?>) consentGroup)
332+
.stream()
333+
.filter(Objects::nonNull)
334+
.allMatch(item -> "NRES".equals(String.valueOf(item)));
335+
} else {
336+
return "NRES".equals(String.valueOf(consentGroup));
337+
}
338+
});
339+
}
264340
}

service/src/main/resources/application-prod.yml

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,6 @@ twds:
1919
- ^https:\/\/s3\.amazonaws.com\/gen3-biodatacatalyst-nhlbi-nih-gov-pfb-export\/
2020
- ^https:\/\/pic-sure-auth-prod-data-export\.s3\.amazonaws\.com\/
2121
- ^https:\/\/s3\.amazonaws\.com\/pic-sure-auth-prod-data-export\/
22-
- ^https:\/\/s3\.amazonaws\.com\/edu-ucsc-gi-platform-anvil-prod-storage-anvilprod\.us-east-1\/
2322
- ^https:\/\/nih-nhlbi-.*\.amazonaws\.com\/
2423
# anvil data is in TDR now, these are the old URLs
2524
- ^https:\/\/gen3-theanvil-io-pfb-export\.s3\.amazonaws\.com\/
@@ -28,12 +27,20 @@ twds:
2827
requireProtectedDataPolicy: true
2928
requiredAuthDomainGroups:
3029
- "federal_data_lockdown"
30+
alwaysApplyAuthDomains: true
3131
- urls:
3232
- ^https:\/\/storage\.googleapis\.com\/datarepo-.*-snapshot-export-bucket
3333
- ^https:\/\/s3\.amazonaws\.com\/edu-ucsc-gi-platform-hca-prod-storage-prod\.us-east-1
3434
requirePrivateWorkspace: false
3535
requireProtectedDataPolicy: false
3636
requiredAuthDomainGroups:
37+
- urls:
38+
- ^https:\/\/s3\.amazonaws\.com\/edu-ucsc-gi-platform-anvil-prod-storage-anvilprod\.us-east-1\/
39+
requirePrivateWorkspace: true
40+
requireProtectedDataPolicy: true
41+
requiredAuthDomainGroups:
42+
- "federal_data_lockdown"
43+
alwaysApplyAuthDomains: false
3744

3845
# Set the allowed hosts for drs pfb imports
3946
drs:

service/src/main/resources/application.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -160,6 +160,7 @@ twds:
160160
requirePrivateWorkspace: false
161161
requireProtectedDataPolicy: false
162162
requiredAuthDomainGroups:
163+
alwaysApplyAuthDomains: true
163164

164165
pg_dump:
165166
path: ${PGDUMP_PATH:/usr/bin/pg_dump}

service/src/test/java/org/databiosphere/workspacedataservice/dataimport/DefaultImportValidatorTest.java

Lines changed: 32 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
import static org.junit.jupiter.api.Assertions.assertTrue;
99
import static org.mockito.ArgumentMatchers.any;
1010
import static org.mockito.Mockito.mock;
11+
import static org.mockito.Mockito.never;
1112
import static org.mockito.Mockito.verify;
1213
import static org.mockito.Mockito.when;
1314

@@ -70,24 +71,34 @@ DefaultImportValidator getDefaultImportValidatorForTest(
7071
/* urls */ List.of(Pattern.compile("authdomain\\.pfb")),
7172
/* requirePrivateWorkspace */ false,
7273
/* requireProtectedDataPolicy */ false,
73-
/* requiredAuthDomainGroups */ List.of(authDomain)),
74+
/* requiredAuthDomainGroups */ List.of(authDomain),
75+
/* alwaysApplyAuthDomains */ true),
7476
new ImportSourceConfig(
7577
/* urls */ List.of(Pattern.compile("protected\\.pfb")),
7678
/* requirePrivateWorkspace */ false,
7779
/* requireProtectedDataPolicy */ true,
78-
/* requiredAuthDomainGroups */ List.of()),
80+
/* requiredAuthDomainGroups */ List.of(),
81+
/* alwaysApplyAuthDomains */ true),
7982
new ImportSourceConfig(
8083
/* urls */ List.of(Pattern.compile("private\\.pfb")),
8184
/* requirePrivateWorkspace */ true,
8285
/* requireProtectedDataPolicy */ false,
83-
/* requiredAuthDomainGroups */ List.of()),
86+
/* requiredAuthDomainGroups */ List.of(),
87+
/* alwaysApplyAuthDomains */ true),
8488
new ImportSourceConfig(
8589
/* urls */ List.of(
8690
Pattern.compile(
8791
"^https:\\/\\/storage\\.googleapis\\.com/datarepo-.*-snapshot-export-bucket")),
8892
/* requirePrivateWorkspace */ false,
8993
/* requireProtectedDataPolicy */ false,
90-
/* requiredAuthDomainGroups */ List.of())),
94+
/* requiredAuthDomainGroups */ List.of(),
95+
/* alwaysApplyAuthDomains */ true),
96+
new ImportSourceConfig(
97+
/* urls */ List.of(Pattern.compile("special-case\\.pfb")),
98+
/* requirePrivateWorkspace */ false,
99+
/* requireProtectedDataPolicy */ false,
100+
/* requiredAuthDomainGroups */ List.of("an-auth-domain"),
101+
/* alwaysApplyAuthDomains */ false)),
91102
/* allowedRawlsBucket */ "test-bucket",
92103
new NoopConnectivityChecker(),
93104
drsImportProperties);
@@ -272,6 +283,21 @@ void addsAuthDomains() {
272283
.addAuthDomainGroupsToWorkspace(destinationWorkspaceId, List.of(authDomain));
273284
}
274285

286+
// If the source defers auth domains (alwaysApplyAuthDomains = false), validation must not add them
287+
@Test
288+
void doesNotAddAuthDomainsIfDeferred() {
289+
// Arrange
290+
ImportRequestServerModel importRequest =
291+
new ImportRequestServerModel(
292+
TypeEnum.PFB, URI.create("https://files.terra.bio/special-case.pfb"));
293+
294+
// Act
295+
importValidator.validateImport(importRequest, destinationWorkspaceId);
296+
297+
// Assert
298+
verify(protectedDataSupport, never()).addAuthDomainGroupsToWorkspace(any(), any());
299+
}
300+
275301
@ParameterizedTest
276302
@MethodSource("requireProtectedWorkspacesForImportsFromConfiguredSourcesTestCases")
277303
void requireProtectedWorkspacesForImportsFromConfiguredSources(
@@ -325,7 +351,8 @@ void connectionFailureInvalidates(Exception ex) throws IOException {
325351
"^https:\\/\\/storage\\.googleapis\\.com/datarepo-.*-snapshot-export-bucket")),
326352
/* requirePrivateWorkspace */ false,
327353
/* requireProtectedDataPolicy */ false,
328-
/* requiredAuthDomainGroups */ List.of())),
354+
/* requiredAuthDomainGroups */ List.of(),
355+
/* alwaysApplyAuthDomains */ true)),
329356
/* allowedRawlsBucket */ "test-bucket",
330357
mockConnectivityChecker,
331358
drsImportProperties);

service/src/test/java/org/databiosphere/workspacedataservice/dataimport/ImportRequirementsFactoryTest.java

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,8 @@ void setup() {
4545
"^https:\\/\\/s3\\.amazonaws\\.com\\/gen3-theanvil-io-pfb-export\\/")),
4646
true, // requirePrivateWorkspace
4747
true, // requireProtectedDataPolicy
48-
List.of("mock-auth-group")));
48+
List.of("mock-auth-group"),
49+
true));
4950
when(dataImportProperties.getSources()).thenReturn(mockSources);
5051
}
5152

0 commit comments

Comments
 (0)