36
36
import java .time .Instant ;
37
37
import java .util .ArrayList ;
38
38
import java .util .Collections ;
39
+ import java .util .Comparator ;
39
40
import java .util .HashSet ;
40
41
import java .util .List ;
41
42
import java .util .Set ;
@@ -72,6 +73,8 @@ public class DwhFiles {
72
73
73
74
private static final String INCREMENTAL_DIR = "incremental_run" ;
74
75
76
+ static final String TIMESTAMP_PREFIX = "_TIMESTAMP_" ;
77
+
75
78
// TODO: It is probably better if we build all DWH files related operations using Beam's
76
79
// filesystem API such that when a new filesystem is registered, it automatically works
77
80
// everywhere in our code. Note that currently we have hardcoded the valid schema in some places,
@@ -121,6 +124,10 @@ static DwhFiles forRoot(String dwhRoot, FhirContext fhirContext) {
121
124
return new DwhFiles (dwhRoot , fhirContext );
122
125
}
123
126
127
+ public static String safeTimestampSuffix () {
128
+ return Instant .now ().toString ().replace (":" , "-" ).replace ("-" , "_" ).replace ("." , "_" );
129
+ }
130
+
124
131
public String getRoot () {
125
132
return dwhRoot ;
126
133
}
@@ -145,50 +152,76 @@ public String getFilePattern(String resourceType) {
145
152
"%s*%s" , getResourcePath (resourceType ).toString (), ParquetUtil .PARQUET_EXTENSION );
146
153
}
147
154
155
+ // TODO: Move this to a util class and make it non-static.
148
156
/**
149
- * This returns the default incremental run path; each incremental run is relative to a full path,
150
- * hence we put this directory under the full-run root.
157
+ * Returns all the child directories under the given base directory which are 1-level deep. Note
158
+ * in many cloud/distributed file-systems, we do not have "directories"; there are only buckets
159
+ * and files in those buckets. We use file-separators (e.g., `/`) to simulate the concept of
160
+ * directories. So for example, this method returns an empty set if `baseDir` is `bucket/test` and
161
+ * the only file in that bucket is `bucket/test/dir1/dir2/file.txt`. If `baseDir` is
162
+ * `bucket/test/dir1`, in the above example, `dir2` is returned.
151
163
*
152
- * @return the default incremental run path
164
+ * @param baseDir the path under which "directories" are looked for.
165
+ * @return The list of all child directories under the base directory
166
+ * @throws IOException
153
167
*/
154
- public ResourceId getIncrementalRunPath () {
155
- return FileSystems .matchNewResource (getRoot (), true )
156
- .resolve (INCREMENTAL_DIR , StandardResolveOptions .RESOLVE_DIRECTORY );
157
- }
158
-
159
- /** This is used when we want to keep a backup of the old incremental run output. */
160
- public ResourceId getIncrementalRunPathWithTimestamp () {
161
- return FileSystems .matchNewResource (getRoot (), true )
162
- .resolve (
163
- String .format ("%s_old_%d" , INCREMENTAL_DIR , System .currentTimeMillis ()),
164
- StandardResolveOptions .RESOLVE_DIRECTORY );
168
+ static Set <ResourceId > getAllChildDirectories (String baseDir ) throws IOException {
169
+ String fileSeparator = getFileSeparatorForDwhFiles (baseDir );
170
+ // Avoid using ResourceId.resolve(..) method to resolve the files when the path contains glob
171
+ // expressions with multiple special characters like **, */* etc as this api only supports
172
+ // single special characters like `*` or `..`. Rather use the FileSystems.match(..) if the path
173
+ // contains glob expressions.
174
+ List <MatchResult > matchResultList =
175
+ FileSystems .match (
176
+ List .of (
177
+ getPathEndingWithFileSeparator (baseDir , fileSeparator )
178
+ + "*"
179
+ + fileSeparator
180
+ + "*" ));
181
+ Set <ResourceId > childDirectories = new HashSet <>();
182
+ for (MatchResult matchResult : matchResultList ) {
183
+ if (matchResult .status () == Status .OK && !matchResult .metadata ().isEmpty ()) {
184
+ for (Metadata metadata : matchResult .metadata ()) {
185
+ childDirectories .add (metadata .resourceId ().getCurrentDirectory ());
186
+ }
187
+ } else if (matchResult .status () == Status .ERROR ) {
188
+ String errorMessage = String .format ("Error matching files under directory %s" , baseDir );
189
+ log .error (errorMessage );
190
+ throw new IOException (errorMessage );
191
+ }
192
+ }
193
+ log .info ("Child directories of {} are {}" , baseDir , childDirectories );
194
+ return childDirectories ;
165
195
}
166
196
167
197
/**
168
- * Similar to {@link #getIncrementalRunPath} but also checks if that directory exists and if so,
169
- * moves it to {@link #getIncrementalRunPathWithTimestamp()}.
198
+ * Also see {@link #newIncrementalRunPath()}
170
199
*
171
- * @return same as {@link #getIncrementalRunPath()}
172
- * @throws IOException if the directory move fails
200
+ * @return the current incremental run path if one found; null otherwise.
173
201
*/
174
- public ResourceId newIncrementalRunPath () throws IOException {
175
- ResourceId incPath = getIncrementalRunPath ();
176
- if (hasIncrementalDir ()) {
177
- ResourceId movePath = getIncrementalRunPathWithTimestamp ();
178
- log .info ("Moving the old {} directory to {}" , INCREMENTAL_DIR , movePath );
179
- FileSystems .rename (Collections .singletonList (incPath ), Collections .singletonList (movePath ));
180
- }
181
- return incPath ;
202
+ @ Nullable
203
+ public ResourceId getLatestIncrementalRunPath () throws IOException {
204
+ List <ResourceId > dirs =
205
+ getAllChildDirectories (getRoot ()).stream ()
206
+ .filter (dir -> dir .getFilename ().contains (INCREMENTAL_DIR + TIMESTAMP_PREFIX ))
207
+ .collect (Collectors .toList ());
208
+ if (dirs .isEmpty ()) return null ;
209
+
210
+ Collections .sort (dirs , Comparator .comparing (ResourceId ::toString ));
211
+ return dirs .get (dirs .size () - 1 );
182
212
}
183
213
184
214
/**
185
- * @return true iff there is already an incremental run subdirectory in this DWH.
215
+ * This returns a new incremental-run path based on the current timestamp. Note that each
216
+ * incremental-run is relative to a full-run, hence we put this directory under the full-run root.
217
+ *
218
+ * @return a new incremental run path based on the current timestamp.
186
219
*/
187
- public boolean hasIncrementalDir () throws IOException {
188
- List < MatchResult > matches =
189
- FileSystems . matchResources ( Collections . singletonList ( getIncrementalRunPath ()));
190
- MatchResult matchResult = Iterables . getOnlyElement ( matches );
191
- return matchResult . status () == Status . OK ;
220
+ public ResourceId newIncrementalRunPath () {
221
+ return FileSystems . matchNewResource ( getRoot (), true )
222
+ . resolve (
223
+ String . format ( "%s%s%s" , INCREMENTAL_DIR , TIMESTAMP_PREFIX , safeTimestampSuffix ()),
224
+ StandardResolveOptions . RESOLVE_DIRECTORY ) ;
192
225
}
193
226
194
227
public Set <String > findNonEmptyResourceDirs () throws IOException {
0 commit comments