17
17
package gobblin .data .management .conversion .hive .converter ;
18
18
19
19
import java .io .IOException ;
20
+ import java .util .Arrays ;
20
21
import java .util .Collections ;
21
22
import java .util .HashMap ;
22
23
import java .util .LinkedHashMap ;
39
40
import org .apache .hadoop .hive .metastore .api .FieldSchema ;
40
41
import org .apache .hadoop .hive .metastore .api .NoSuchObjectException ;
41
42
import org .apache .hadoop .hive .metastore .api .Table ;
43
+ import org .apache .hadoop .hive .ql .metadata .HiveException ;
42
44
import org .apache .hadoop .hive .ql .metadata .Partition ;
43
45
import org .apache .thrift .TException ;
44
46
@@ -135,6 +137,14 @@ public String getConfigPrefix() {
135
137
public static final String HIVE_DATASET_DESTINATION_SKIP_SETGROUP = "hive.dataset.destination.skip.setGroup" ;
136
138
public static final boolean DEFAULT_HIVE_DATASET_DESTINATION_SKIP_SETGROUP = false ;
137
139
140
+ /**
141
+ * If the property is set to true then partition dir is overwritten,
142
+ * else a new time-stamped partition dir is created to avoid breaking in-flight queries.
143
+ * See gobblin.data.management.retention.Avro2OrcStaleDatasetCleaner for cleaning up the stale directories.
144
+ */
145
+ public static final String HIVE_DATASET_PARTITION_OVERWRITE = "hive.dataset.partition.overwrite" ;
146
+ public static final boolean DEFAULT_HIVE_DATASET_PARTITION_OVERWRITE = true ;
147
+
138
148
/**
139
149
* If set to true, a set format DDL will be separate from add partition DDL
140
150
*/
@@ -459,31 +469,35 @@ public Iterable<QueryBasedHiveConversionEntity> convertRecord(Schema outputAvroS
459
469
// Step:
460
470
// A.2.3, B.2.3: If partitioned table, move partitions from staging to final table; for all partitions:
461
471
462
- // Step:
463
- // A.2.3.1, B.2.3.1: Drop if exists partition in final table
464
- List <String > dropPartitionsDDL =
465
- HiveAvroORCQueryGenerator .generateDropPartitionsDDL (orcTableDatabase ,
466
- orcTableName ,
467
- partitionsDMLInfo );
468
- log .debug ("Drop partitions if exist in final table: " + dropPartitionsDDL );
469
- publishQueries .addAll (dropPartitionsDDL );
470
-
471
472
// Step:
472
473
// A.2.3.2, B.2.3.2: Move partition directory
473
474
// Move: orcStagingDataPartitionLocation to: orcFinalDataPartitionLocation
474
475
String orcFinalDataPartitionLocation = orcDataLocation + Path .SEPARATOR + orcStagingDataPartitionDirName ;
475
- log .info ("Partition directory to move: " + orcStagingDataPartitionLocation + " to: " + orcFinalDataPartitionLocation );
476
+ Optional <Path > destPartitionLocation = getDestinationPartitionLocation (destinationTableMeta , workUnit ,
477
+ conversionEntity .getHivePartition ().get ().getName ());
478
+ orcFinalDataPartitionLocation =
479
+ updatePartitionLocation (orcFinalDataPartitionLocation , workUnit , destPartitionLocation );
480
+ log .info (
481
+ "Partition directory to move: " + orcStagingDataPartitionLocation + " to: " + orcFinalDataPartitionLocation );
476
482
publishDirectories .put (orcStagingDataPartitionLocation , orcFinalDataPartitionLocation );
483
+ // Step:
484
+ // A.2.3.1, B.2.3.1: Drop if exists partition in final table
477
485
478
486
// Step:
487
+ // If destination partition already exists, alter the partition location
479
488
// A.2.3.3, B.2.3.3: Create partition with location (and update storage format if not in ORC already)
480
- String orcDataPartitionLocation = orcDataLocation + Path .SEPARATOR + orcStagingDataPartitionDirName ;
489
+ List <String > dropPartitionsDDL =
490
+ HiveAvroORCQueryGenerator .generateDropPartitionsDDL (orcTableDatabase ,
491
+ orcTableName ,
492
+ partitionsDMLInfo );
493
+ log .debug ("Drop partitions if exist in final table: " + dropPartitionsDDL );
494
+ publishQueries .addAll (dropPartitionsDDL );
481
495
if (workUnit .getPropAsBoolean (HIVE_CONVERSION_SETSERDETOAVROEXPLICITELY ,
482
496
DEFAULT_HIVE_CONVERSION_SETSERDETOAVROEXPLICITELY )) {
483
497
List <String > createFinalPartitionDDL =
484
498
HiveAvroORCQueryGenerator .generateCreatePartitionDDL (orcTableDatabase ,
485
499
orcTableName ,
486
- orcDataPartitionLocation ,
500
+ orcFinalDataPartitionLocation ,
487
501
partitionsDMLInfo ,
488
502
Optional .<String >absent ());
489
503
@@ -503,7 +517,7 @@ public Iterable<QueryBasedHiveConversionEntity> convertRecord(Schema outputAvroS
503
517
List <String > createFinalPartitionDDL =
504
518
HiveAvroORCQueryGenerator .generateCreatePartitionDDL (orcTableDatabase ,
505
519
orcTableName ,
506
- orcDataPartitionLocation ,
520
+ orcFinalDataPartitionLocation ,
507
521
partitionsDMLInfo ,
508
522
Optional .fromNullable (ORC_FORMAT ));
509
523
@@ -747,4 +761,50 @@ private Pair<Optional<Table>, Optional<List<Partition>>> getDestinationTableMeta
747
761
748
762
return ImmutablePair .of (table , partitions );
749
763
}
764
+
765
+ /**
766
+ * If partition already exists then new partition location will be a separate time stamp dir
767
+ * If partition location is /a/b/c/<oldTimeStamp> then new partition location is /a/b/c/<currentTimeStamp>
768
+ * If partition location is /a/b/c/ then new partition location is /a/b/c/<currentTimeStamp>
769
+ **/
770
+ private String updatePartitionLocation (String orcDataPartitionLocation , WorkUnitState workUnitState ,
771
+ Optional <Path > destPartitionLocation )
772
+ throws DataConversionException {
773
+
774
+ if (workUnitState .getPropAsBoolean (HIVE_DATASET_PARTITION_OVERWRITE , DEFAULT_HIVE_DATASET_PARTITION_OVERWRITE )) {
775
+ return orcDataPartitionLocation ;
776
+ }
777
+ if (!destPartitionLocation .isPresent ()) {
778
+ return orcDataPartitionLocation ;
779
+ }
780
+ long timeStamp = System .currentTimeMillis ();
781
+ return StringUtils .join (Arrays .asList (orcDataPartitionLocation , timeStamp ), '/' );
782
+ }
783
+
784
+ private Optional <Path > getDestinationPartitionLocation (Optional <Table > table , WorkUnitState state ,
785
+ String partitionName )
786
+ throws DataConversionException {
787
+ Optional <org .apache .hadoop .hive .metastore .api .Partition > partitionOptional =
788
+ Optional .<org .apache .hadoop .hive .metastore .api .Partition >absent ();
789
+ if (!table .isPresent ()) {
790
+ return Optional .<Path >absent ();
791
+ }
792
+ try {
793
+ HiveMetastoreClientPool pool = HiveMetastoreClientPool .get (state .getJobState ().getProperties (),
794
+ Optional .fromNullable (state .getJobState ().getProp (HiveDatasetFinder .HIVE_METASTORE_URI_KEY )));
795
+ try (AutoReturnableObject <IMetaStoreClient > client = pool .getClient ()) {
796
+ partitionOptional =
797
+ Optional .of (client .get ().getPartition (table .get ().getDbName (), table .get ().getTableName (), partitionName ));
798
+ }
799
+ if (partitionOptional .isPresent ()) {
800
+ org .apache .hadoop .hive .ql .metadata .Table qlTable = new org .apache .hadoop .hive .ql .metadata .Table (table .get ());
801
+ org .apache .hadoop .hive .ql .metadata .Partition qlPartition =
802
+ new org .apache .hadoop .hive .ql .metadata .Partition (qlTable , partitionOptional .get ());
803
+ return Optional .of (qlPartition .getDataLocation ());
804
+ }
805
+ } catch (IOException | TException | HiveException e ) {
806
+ throw new DataConversionException ("Could not fetch destination table metadata" , e );
807
+ }
808
+ return Optional .<Path >absent ();
809
+ }
750
810
}
0 commit comments