Skip to content

Insert into bucketed but unpartitioned Hive table #25139

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 4 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -476,9 +476,6 @@ private WriterParameters getWriterParametersForExistingUnpartitionedTable(Option
{
// Note: temporary table is always empty at this step
if (!table.getTableType().equals(TEMPORARY_TABLE)) {
if (bucketNumber.isPresent()) {
throw new PrestoException(HIVE_PARTITION_READ_ONLY, "Cannot insert into bucketed unpartitioned Hive table");
}
if (immutablePartitions) {
throw new PrestoException(HIVE_PARTITION_READ_ONLY, "Unpartitioned Hive tables are immutable");
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -863,6 +863,117 @@ public void testCreateTableNonSupportedVarcharColumn()
assertUpdate("CREATE TABLE test_create_table_non_supported_varchar_column (apple varchar(65536))");
}

@Test
public void testEmptyBucketedTable()
{
    // exercise the scenario under every storage format so that empty bucket creation is checked for each of them
    testWithAllStorageFormats((session, storageFormat) -> testEmptyBucketedTable(session, storageFormat));
}

private void testEmptyBucketedTable(Session session, HiveStorageFormat storageFormat)
{
    // run all four flag combinations, in the order (true, true), (true, false), (false, true), (false, false)
    boolean[] flagValues = {true, false};
    for (boolean optimizedSerialization : flagValues) {
        for (boolean createEmptyFiles : flagValues) {
            testEmptyBucketedTable(session, storageFormat, optimizedSerialization, createEmptyFiles);
        }
    }
}

private void testEmptyBucketedTable(Session session, HiveStorageFormat storageFormat, boolean optimizedPartitionUpdateSerializationEnabled, boolean createEmpty)
{
    String table = "test_empty_bucketed_table";
    String selectAll = "SELECT * from " + table;

    // create a bucketed table without a PARTITIONED BY clause; it stays empty until the inserts below
    @Language("SQL") String ddl = "CREATE TABLE " + table + " (bucket_key VARCHAR, col_1 VARCHAR, col2 VARCHAR) " +
            "WITH (format = '" + storageFormat + "', bucketed_by = ARRAY[ 'bucket_key' ], bucket_count = 11 ) ";
    assertUpdate(ddl);

    // the stored table properties must reflect the DDL: requested format, no partitioning, 11 buckets on bucket_key
    TableMetadata metadata = getTableMetadata(catalog, TPCH_SCHEMA, table);
    assertEquals(metadata.getMetadata().getProperties().get(STORAGE_FORMAT_PROPERTY), storageFormat);
    assertNull(metadata.getMetadata().getProperties().get(PARTITIONED_BY_PROPERTY));
    assertEquals(metadata.getMetadata().getProperties().get(BUCKETED_BY_PROPERTY), ImmutableList.of("bucket_key"));
    assertEquals(metadata.getMetadata().getProperties().get(BUCKET_COUNT_PROPERTY), 11);

    assertEquals(computeActual(selectAll).getRowCount(), 0);

    // write with the session under test: the serialization flag picks the base session,
    // and create_empty_bucket_files is set from createEmpty
    Session baseSession = getTableWriteTestingSession(optimizedPartitionUpdateSerializationEnabled);
    Session writeSession = Session.builder(baseSession)
            .setCatalogSessionProperty(catalog, "create_empty_bucket_files", Boolean.toString(createEmpty))
            .build();

    // two separate single-row inserts; the second must add to, not replace, the first
    String insertPrefix = "INSERT INTO " + table + " VALUES ";
    assertUpdate(writeSession, insertPrefix + "('a0', 'b0', 'c0')", 1);
    assertUpdate(writeSession, insertPrefix + "('a1', 'b1', 'c1')", 1);

    assertQuery(selectAll, "VALUES ('a0', 'b0', 'c0'), ('a1', 'b1', 'c1')");

    assertUpdate(session, "DROP TABLE " + table);
    assertFalse(getQueryRunner().tableExists(session, table));
}

@Test
public void testBucketedTable()
{
    // exercise the scenario under every storage format so that bucket file creation is checked for each of them
    testWithAllStorageFormats((session, storageFormat) -> testBucketedTable(session, storageFormat));
}

private void testBucketedTable(Session session, HiveStorageFormat storageFormat)
{
    // run all four flag combinations, in the order (true, true), (true, false), (false, true), (false, false)
    boolean[] flagValues = {true, false};
    for (boolean optimizedSerialization : flagValues) {
        for (boolean createEmptyFiles : flagValues) {
            testBucketedTable(session, storageFormat, optimizedSerialization, createEmptyFiles);
        }
    }
}

private void testBucketedTable(Session session, HiveStorageFormat storageFormat, boolean optimizedPartitionUpdateSerializationEnabled, boolean createEmpty)
{
    String table = "test_bucketed_table";
    String selectAll = "SELECT * from " + table;
    String initialRows = "('a', 'b', 'c'), ('aa', 'bb', 'cc'), ('aaa', 'bbb', 'ccc')";

    // CTAS into a bucketed table without a PARTITIONED BY clause, seeded with three rows
    @Language("SQL") String ctas = "CREATE TABLE " + table + " WITH (format = '" + storageFormat + "', bucketed_by = ARRAY[ 'bucket_key' ], bucket_count = 11 ) " +
            "AS SELECT * FROM (VALUES " +
            " (VARCHAR 'a', VARCHAR 'b', VARCHAR 'c'), " +
            " ('aa', 'bb', 'cc'), " +
            " ('aaa', 'bbb', 'ccc')" +
            ") t (bucket_key, col_1, col_2)";

    // write with the session under test: the serialization flag picks the base session,
    // and create_empty_bucket_files is set from createEmpty
    Session baseSession = getTableWriteTestingSession(optimizedPartitionUpdateSerializationEnabled);
    Session writeSession = Session.builder(baseSession)
            .setCatalogSessionProperty(catalog, "create_empty_bucket_files", Boolean.toString(createEmpty))
            .build();
    assertUpdate(writeSession, ctas, 3);

    // the stored table properties must reflect the DDL: requested format, no partitioning, 11 buckets on bucket_key
    TableMetadata metadata = getTableMetadata(catalog, TPCH_SCHEMA, table);
    assertEquals(metadata.getMetadata().getProperties().get(STORAGE_FORMAT_PROPERTY), storageFormat);
    assertNull(metadata.getMetadata().getProperties().get(PARTITIONED_BY_PROPERTY));
    assertEquals(metadata.getMetadata().getProperties().get(BUCKETED_BY_PROPERTY), ImmutableList.of("bucket_key"));
    assertEquals(metadata.getMetadata().getProperties().get(BUCKET_COUNT_PROPERTY), 11);

    assertQuery(selectAll, "VALUES " + initialRows);

    // inserts into the already populated table must append to the rows written by the CTAS
    String insertPrefix = "INSERT INTO " + table + " VALUES ";
    assertUpdate(writeSession, insertPrefix + "('a0', 'b0', 'c0')", 1);
    assertUpdate(writeSession, insertPrefix + "('a1', 'b1', 'c1')", 1);

    assertQuery(selectAll, "VALUES " + initialRows + ", ('a0', 'b0', 'c0'), ('a1', 'b1', 'c1')");

    assertUpdate(session, "DROP TABLE " + table);
    assertFalse(getQueryRunner().tableExists(session, table));
}

@Test
public void testCreatePartitionedBucketedTableAsFewRows()
{
Expand Down
5 changes: 5 additions & 0 deletions presto-native-tests/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -156,6 +156,11 @@
<version>${project.version}</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>io.prestodb.tempto</groupId>
<artifactId>tempto-core</artifactId>
<scope>test</scope>
</dependency>
</dependencies>

<build>
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
/*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.facebook.presto.nativetests;

import com.facebook.presto.testing.QueryRunner;
import com.facebook.presto.tests.AbstractTestQueryFramework;
import org.intellij.lang.annotations.Language;
import org.testng.annotations.BeforeClass;
import org.testng.annotations.Test;

import static io.prestodb.tempto.fulfillment.table.hive.tpch.TpchTableDefinitions.NATION;
import static java.lang.Boolean.parseBoolean;
import static org.testng.Assert.assertEquals;

public class TestHivePartitionedInsertNative
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could we move these testcases to presto-tests or presto-product-tests? Ideally, we don't want to add new testcases to presto-native-tests, instead we should just extend the existing e2e tests (such as the ones added to presto-product-tests in this PR) to run with the native query runner.

extends AbstractTestQueryFramework
{
private String storageFormat;
private boolean sidecarEnabled;
QueryRunner queryRunner;

// Reads the test configuration from JVM system properties and then delegates to the
// parent class's init(). "storageFormat" defaults to "PARQUET" and "sidecarEnabled"
// defaults to "true" when the corresponding property is not set.
@BeforeClass
@Override
public void init() throws Exception
{
storageFormat = System.getProperty("storageFormat", "PARQUET");
sidecarEnabled = parseBoolean(System.getProperty("sidecarEnabled", "true"));
// NOTE(review): presumably super.init() ends up calling createQueryRunner()/createTables(),
// which read the two fields assigned above — confirm before reordering these statements
super.init();
}

// Creates the native query runner for the configured storage format and sidecar setting.
// The runner is also stored in the queryRunner field, which the test methods of this
// class use to execute statements directly.
@Override
protected QueryRunner createQueryRunner() throws Exception
{
queryRunner = NativeTestsUtils.createNativeQueryRunner(storageFormat, sidecarEnabled);
return queryRunner;
}

// Delegates table setup to NativeTestsUtils.createTables, passing the configured storage format.
@Override
protected void createTables()
{
NativeTestsUtils.createTables(storageFormat);
}


/**
 * Verifies that repeated inserts into a bucketed table without a PARTITIONED BY clause
 * append rows instead of failing or overwriting the data written earlier.
 */
@Test
public void testInsertIntoBucketedTables()
{
    String tableName = "hive.tpch.bucketed_nation";

    // Remove any table left behind by an earlier run that did not finish cleanly
    queryRunner.execute("DROP TABLE IF EXISTS " + tableName);

    // Create the bucketed table
    @Language("SQL") String createTableSql = "CREATE TABLE " + tableName + " (\n" +
            " n_nationkey BIGINT,\n" +
            " n_name VARCHAR,\n" +
            " n_regionkey BIGINT,\n" +
            " n_comment VARCHAR\n" +
            ")\n" +
            "WITH (\n" +
            " format = 'PARQUET',\n" +
            " bucketed_by = ARRAY['n_regionkey'],\n" +
            " bucket_count = 2\n" +
            ")";
    queryRunner.execute(createTableSql);

    try {
        // Insert the same data twice; the second insert must not replace the first
        String insertSql = "INSERT INTO " + tableName + " SELECT * FROM " + NATION.getName();
        queryRunner.execute(insertSql);
        queryRunner.execute(insertSql);

        // Compare the scalar count itself rather than MaterializedResult.toString(), whose
        // exact text also encodes types and session properties and is not a stable contract
        assertEquals(queryRunner.execute("SELECT count(*) FROM " + tableName).getOnlyValue(), 50L);
        assertEquals(queryRunner.execute("SELECT count(*) FROM " + tableName + " WHERE n_regionkey = 0").getOnlyValue(), 10L);
    }
    finally {
        // Do not leave the table behind for other tests sharing this query runner
        queryRunner.execute("DROP TABLE IF EXISTS " + tableName);
    }
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -278,13 +278,18 @@ public void testInsertBucketed()
assertThat(statisticsAfterCreate.getNumRows().getAsLong()).isEqualTo(25);
assertThat(statisticsAfterCreate.getNumFiles().getAsLong()).isEqualTo(50);

// Insert into bucketed unpartitioned table is unsupported
assertThatThrownBy(() -> insertNationData(onPresto(), tableName))
.hasMessageContaining("Cannot insert into bucketed unpartitioned Hive table");
insertNationData(onPresto(), tableName);

BasicStatistics statisticsAfterInsert = getBasicStatisticsForTable(onHive(), tableName);
assertThat(statisticsAfterInsert.getNumRows().getAsLong()).isEqualTo(25);
assertThat(statisticsAfterCreate.getNumFiles().getAsLong()).isEqualTo(50);

assertThat(statisticsAfterInsert.getNumRows().getAsLong()).isEqualTo(50);
assertThat(statisticsAfterInsert.getNumFiles().getAsLong()).isEqualTo(100);

insertNationData(onPresto(), tableName);

BasicStatistics statisticsAfterInsert2 = getBasicStatisticsForTable(onHive(), tableName);
assertThat(statisticsAfterInsert2.getNumRows().getAsLong()).isEqualTo(75);
assertThat(statisticsAfterInsert2.getNumFiles().getAsLong()).isEqualTo(150);
}
finally {
onPresto().executeQuery(format("DROP TABLE IF EXISTS %s", tableName));
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -44,35 +44,35 @@ public class TestHiveBucketedTables
implements RequirementsProvider
{
@TableDefinitionsRepository.RepositoryTableDefinition
public static final HiveTableDefinition BUCKETED_PARTITIONED_NATION = HiveTableDefinition.builder("bucket_partition_nation")
.setCreateTableDDLTemplate("CREATE TABLE %NAME%(" +
"n_nationkey BIGINT," +
"n_name STRING," +
"n_regionkey BIGINT," +
"n_comment STRING) " +
"PARTITIONED BY (part_key STRING) " +
"CLUSTERED BY (n_regionkey) " +
"INTO 2 BUCKETS " +
"ROW FORMAT DELIMITED FIELDS TERMINATED BY '|'")
.setNoData()
.build();
public static final HiveTableDefinition BUCKETED_NATION = bucketTableDefinition("bucket_nation", false, true);

@TableDefinitionsRepository.RepositoryTableDefinition
public static final HiveTableDefinition PARTITIONED_NATION = HiveTableDefinition.builder("partitioned_nation")
.setCreateTableDDLTemplate("CREATE TABLE %NAME%(" +
"n_nationkey BIGINT," +
"n_name STRING," +
"n_regionkey BIGINT," +
"n_comment STRING) " +
"PARTITIONED BY (part_key STRING) " +
"ROW FORMAT DELIMITED FIELDS TERMINATED BY '|'")
.setNoData()
.build();
public static final HiveTableDefinition BUCKETED_PARTITIONED_NATION = bucketTableDefinition("bucket_partitioned_nation", true, true);

@TableDefinitionsRepository.RepositoryTableDefinition
public static final HiveTableDefinition PARTITIONED_NATION = bucketTableDefinition("partitioned_nation", true, false);

/**
 * Builds a data-less Hive table definition with the nation columns.
 *
 * @param tableName name of the table definition
 * @param partitioned whether to add {@code PARTITIONED BY (part_key STRING)}
 * @param bucketed whether to add {@code CLUSTERED BY (n_regionkey) INTO 2 BUCKETS}
 * @return the table definition, created without data
 */
private static HiveTableDefinition bucketTableDefinition(String tableName, boolean partitioned, boolean bucketed)
{
    String partitionClause = partitioned ? "PARTITIONED BY (part_key STRING) " : "";
    // Hive accepts CLUSTERED BY only together with INTO <n> BUCKETS, so the two parts are
    // emitted or omitted as a unit; a lone CLUSTERED BY would make the DDL invalid
    String bucketClause = bucketed ? "CLUSTERED BY (n_regionkey) INTO 2 BUCKETS " : "";

    return HiveTableDefinition.builder(tableName)
            .setCreateTableDDLTemplate("CREATE TABLE %NAME%(" +
                    "n_nationkey BIGINT," +
                    "n_name STRING," +
                    "n_regionkey BIGINT," +
                    "n_comment STRING) " +
                    partitionClause +
                    bucketClause +
                    "ROW FORMAT DELIMITED FIELDS TERMINATED BY '|'")
            .setNoData()
            .build();
}

@Override
public Requirement getRequirements(Configuration configuration)
{
    // both mutable bucketed tables must exist (in CREATED state) before the tests run,
    // alongside the immutable nation table
    Requirement bucketedNation = MutableTableRequirement.builder(BUCKETED_NATION).withState(CREATED).build();
    Requirement bucketedPartitionedNation = MutableTableRequirement.builder(BUCKETED_PARTITIONED_NATION).withState(CREATED).build();
    Requirement nation = immutableTable(NATION);
    return Requirements.compose(bucketedNation, bucketedPartitionedNation, nation);
}
Expand Down Expand Up @@ -167,4 +167,17 @@ private static void disableBucketedExecution()
throw new RuntimeException(e);
}
}

@Test
public void testInsertIntoBucketedTables()
{
    String bucketedTable = mutableTablesState().get(BUCKETED_NATION).getNameInDatabase();
    String insertNation = format("INSERT INTO %s SELECT * FROM %s", bucketedTable, NATION.getName());

    query(insertNation);
    // run the same insert again: it has to append, not overwrite what the first one wrote
    query(insertNation);

    String countAll = format("SELECT count(*) FROM %s", bucketedTable);
    String countRegionZero = format("SELECT count(*) FROM %s WHERE n_regionkey=0", bucketedTable);
    assertThat(query(countAll)).containsExactly(row(50));
    assertThat(query(countRegionZero)).containsExactly(row(10));
}
}
Loading