@Override
public QueryBasedHivePublishEntity generatePublishQueries() throws Exception {
  return queryGenerator.generatePublishQueries();
}
}
@Override
public List<String> generateHiveQueries() {
  return queryGenerator.generateQueries();
}
public HiveMaterializerQueryGenerator(WorkUnitState workUnitState) throws IOException {
  this.fs = HiveSource.getSourceFs(workUnitState);
  this.pool = HiveMetastoreClientPool.get(workUnitState.getJobState().getProperties(),
      Optional.fromNullable(workUnitState.getJobState().getProp(HiveDatasetFinder.HIVE_METASTORE_URI_KEY)));
  this.workUnitState = workUnitState;
  this.workUnit = new HiveWorkUnit(workUnitState.getWorkunit());

  this.outputTableMetadata = HiveMaterializer.parseStageableTableMetadata(this.workUnit);
  this.outputDatabaseName = outputTableMetadata.getDestinationDbName();
  this.outputTableName = outputTableMetadata.getDestinationTableName();
  this.outputDataLocation = HiveConverterUtils.getOutputDataLocation(outputTableMetadata.getDestinationDataPath());

  this.destinationTableMeta = HiveConverterUtils.getDestinationTableMeta(this.outputTableMetadata.getDestinationDbName(),
      this.outputTableMetadata.getDestinationTableName(), workUnitState.getProperties()).getLeft();

  this.stagingTableName = HiveConverterUtils.getStagingTableName(this.outputTableMetadata.getDestinationStagingTableName());
  this.stagingDataLocation = HiveConverterUtils.getStagingDataLocation(this.outputTableMetadata.getDestinationDataPath(),
      this.stagingTableName);
}
/**
 * Create a work unit to materialize a query to a target table using a staging table in between.
 * @param query the query to materialize.
 * @param storageFormat format in which the target table should be written.
 * @param destinationTable {@link StageableTableMetadata} specifying staging and target table metadata.
 */
public static WorkUnit queryResultMaterializationWorkUnit(String query, HiveConverterUtils.StorageFormat storageFormat,
    StageableTableMetadata destinationTable) {
  WorkUnit workUnit = new WorkUnit();
  workUnit.setProp(MATERIALIZER_MODE_KEY, MaterializerMode.QUERY_RESULT_MATERIALIZATION.name());
  workUnit.setProp(STORAGE_FORMAT_KEY, storageFormat.name());
  workUnit.setProp(QUERY_RESULT_TO_MATERIALIZE_KEY, query);
  workUnit.setProp(STAGEABLE_TABLE_METADATA_KEY, HiveSource.GENERICS_AWARE_GSON.toJson(destinationTable));
  TaskUtils.setTaskFactoryClass(workUnit, HiveMaterializerTaskFactory.class);
  HiveTask.disableHiveWatermarker(workUnit);
  return workUnit;
}
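/**
 * Usage sketch (illustrative only, not part of the original source): builds a work unit that
 * materializes an ad-hoc query. The query string is a hypothetical placeholder, the
 * {@code destinationTable} is assumed to be built from job config elsewhere, and the example
 * assumes the {@code StorageFormat} enum defines an ORC constant.
 */
public static WorkUnit exampleQueryMaterialization(StageableTableMetadata destinationTable) {
  return queryResultMaterializationWorkUnit(
      "SELECT id, name FROM `src_db`.`events` WHERE `datepartition`='2017-07-15-08'",
      HiveConverterUtils.StorageFormat.ORC,
      destinationTable);
}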
@Override
public List<WorkUnit> getWorkunits(SourceState state) {
  try {
    FileSystem fs = HadoopUtils.getSourceFileSystem(state);
    Config config = ConfigUtils.propertiesToConfig(state.getProperties());

    if (state.contains(COPY_TABLE_KEY)) {
      HiveDataset dataset = getHiveDataset(state.getProp(COPY_TABLE_KEY), fs, state);
      WorkUnit workUnit = HiveMaterializer.tableCopyWorkUnit(dataset,
          new StageableTableMetadata(config.getConfig(HIVE_MATERIALIZER_SOURCE_PREFIX), dataset.getTable()), null);
      HiveTask.disableHiveWatermarker(workUnit);
      return Lists.newArrayList(workUnit);
    } else if (state.contains(MATERIALIZE_VIEW)) {
      HiveDataset dataset = getHiveDataset(state.getProp(MATERIALIZE_VIEW), fs, state);
      WorkUnit workUnit = HiveMaterializer.viewMaterializationWorkUnit(dataset, getOutputStorageFormat(state),
          new StageableTableMetadata(config.getConfig(HIVE_MATERIALIZER_SOURCE_PREFIX), dataset.getTable()), null);
      HiveTask.disableHiveWatermarker(workUnit);
      return Lists.newArrayList(workUnit);
    } else if (state.contains(MATERIALIZE_QUERY)) {
      String query = state.getProp(MATERIALIZE_QUERY);
      WorkUnit workUnit = HiveMaterializer.queryResultMaterializationWorkUnit(query, getOutputStorageFormat(state),
          new StageableTableMetadata(config.getConfig(HIVE_MATERIALIZER_SOURCE_PREFIX), null));
      HiveTask.disableHiveWatermarker(workUnit);
      return Lists.newArrayList(workUnit);
    }
  } catch (IOException ioe) {
    throw new RuntimeException(ioe);
  }
  throw new RuntimeException(String.format("Must specify either %s, %s, or %s.", COPY_TABLE_KEY, MATERIALIZE_QUERY,
      MATERIALIZE_VIEW));
}
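// Configuration sketch (illustrative; table names and query text below are hypothetical
// placeholders, and the angle-bracket tokens stand for the string values of the constants
// referenced above). Exactly one of the three mode keys is expected in the job config:
//   <COPY_TABLE_KEY>    = sourceDb.sourceTable   -> copy a table as-is
//   <MATERIALIZE_VIEW>  = sourceDb.sourceView    -> materialize a view
//   <MATERIALIZE_QUERY> = SELECT * FROM ...      -> materialize an ad-hoc query result
// Staging/target table settings are read from the config scoped under
// HIVE_MATERIALIZER_SOURCE_PREFIX and deserialized into a StageableTableMetadata.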
public MaterializeTableQueryGenerator(WorkUnitState workUnitState) throws IOException {
  super(workUnitState, false);
  this.storageFormat = HiveConverterUtils.StorageFormat.valueOf(workUnitState.getProp(HiveMaterializer.STORAGE_FORMAT_KEY));
}
@Override
public void commit() {
  try {
    executePublishQueries(generatePublishQueries());
    super.commit();
  } catch (Exception e) {
    this.workingState = WorkUnitState.WorkingState.FAILED;
    log.error("Exception in HiveTask while generating or executing publish queries.", e);
  }
}
}
public HiveMaterializerFromEntityQueryGenerator(WorkUnitState workUnitState, boolean supportTargetPartitioning)
    throws IOException {
  super(workUnitState);
  try {
    this.conversionEntity = getConversionEntity(this.workUnit);
  } catch (TException | HiveException ex) {
    throw new IOException(ex);
  }
  this.sourceTable = this.conversionEntity.getTable();
  this.inputDbName = this.sourceTable.getDbName();
  this.inputTableName = this.sourceTable.getTableName();

  this.sourceDataPathIdentifier = this.outputTableMetadata.getSourceDataPathIdentifier();
  this.stagingDataPartitionDirName = HiveConverterUtils.getStagingDataPartitionDirName(conversionEntity,
      sourceDataPathIdentifier);
  this.stagingDataPartitionLocation = stagingDataLocation + Path.SEPARATOR + stagingDataPartitionDirName;

  this.partitionsDDLInfo = Maps.newHashMap();
  this.partitionsDMLInfo = Maps.newHashMap();
  HiveConverterUtils.populatePartitionInfo(conversionEntity, partitionsDDLInfo, partitionsDMLInfo);
  this.supportTargetPartitioning = supportTargetPartitioning;
}
/**
 * Create a work unit to materialize a table / view to a target table using a staging table in between.
 * @param dataset {@link HiveDataset} for the source table.
 * @param storageFormat format in which the target table should be written.
 * @param destinationTable {@link StageableTableMetadata} specifying staging and target table metadata.
 * @param partitionName optional name of the single source partition to materialize.
 */
public static HiveWorkUnit viewMaterializationWorkUnit(HiveDataset dataset, HiveConverterUtils.StorageFormat storageFormat,
    StageableTableMetadata destinationTable, @Nullable String partitionName) {
  HiveWorkUnit workUnit = new HiveWorkUnit(dataset);
  workUnit.setProp(MATERIALIZER_MODE_KEY, MaterializerMode.TABLE_MATERIALIZATION.name());
  workUnit.setProp(STORAGE_FORMAT_KEY, storageFormat.name());
  workUnit.setProp(STAGEABLE_TABLE_METADATA_KEY, HiveSource.GENERICS_AWARE_GSON.toJson(destinationTable));
  if (!Strings.isNullOrEmpty(partitionName)) {
    workUnit.setPartitionName(partitionName);
  }
  TaskUtils.setTaskFactoryClass(workUnit, HiveMaterializerTaskFactory.class);
  return workUnit;
}
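/**
 * Usage sketch (illustrative only, not part of the original source): materializes a single
 * partition of a view. The dataset, destination metadata, and partition name are hypothetical,
 * and the example assumes the {@code StorageFormat} enum defines an ORC constant.
 */
public static HiveWorkUnit exampleViewMaterialization(HiveDataset viewDataset, StageableTableMetadata destination) {
  return viewMaterializationWorkUnit(viewDataset, HiveConverterUtils.StorageFormat.ORC, destination,
      "datepartition=2017-07-15-08");
}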
@Test
public void copyTableQueryTest() throws Exception {
  Map<String, String> partitionsDMLInfo = Maps.newHashMap();
  String partitionName = "datepartition";
  String partitionValue = "2017-07-15-08";
  partitionsDMLInfo.put(partitionName, partitionValue);

  String expectedQuery = "INSERT OVERWRITE TABLE `" + outputDatabaseName + "`.`" + outputTableName + "` \n"
      + "PARTITION (`" + partitionName + "`) \n"
      + "SELECT * FROM `" + inputDbName + "`.`" + inputTableName + "` WHERE "
      + "`" + partitionName + "`='" + partitionsDMLInfo.get(partitionName) + "'";

  String actualQuery = HiveConverterUtils.generateTableCopy(inputTableName, outputTableName, inputDbName,
      outputDatabaseName, Optional.of(partitionsDMLInfo));
  Assert.assertEquals(expectedQuery, actualQuery);
}
}
@Override
public List<String> generateQueries() {
  ensureParentOfStagingPathExists();
  return Lists.newArrayList(HiveConverterUtils.generateStagingCTASStatement(
      new HiveDatasetFinder.DbAndTable(this.outputDatabaseName, this.stagingTableName),
      this.sourceQuery, this.storageFormat, this.stagingDataLocation));
}
@Override
public List<String> generateQueries() {
  ensureParentOfStagingPathExists();
  return Lists.newArrayList(HiveConverterUtils.generateStagingCTASStatementFromSelectStar(
      new HiveDatasetFinder.DbAndTable(this.outputDatabaseName, this.stagingTableName),
      new HiveDatasetFinder.DbAndTable(this.inputDbName, this.inputTableName),
      this.partitionsDMLInfo, this.storageFormat, this.stagingDataLocation));
}
}
/**
 * Generates a CTAS statement to dump the results of a query into a new table.
 * @param outputDbAndTable output db and table where contents should be written.
 * @param sourceQuery query to materialize.
 * @param storageFormat format of output table.
 * @param outputTableLocation location where files of output table should be written.
 */
public static String generateStagingCTASStatement(HiveDatasetFinder.DbAndTable outputDbAndTable, String sourceQuery,
    StorageFormat storageFormat, String outputTableLocation) {
  Preconditions.checkArgument(!Strings.isNullOrEmpty(outputDbAndTable.getDb()) && !Strings.isNullOrEmpty(outputDbAndTable.getTable()),
      "Invalid output db and table " + outputDbAndTable);
  return String.format("CREATE TEMPORARY TABLE `%s`.`%s` STORED AS %s LOCATION '%s' AS %s",
      outputDbAndTable.getDb(), outputDbAndTable.getTable(), storageFormat.getHiveName(), outputTableLocation, sourceQuery);
}
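// Example (illustrative, with hypothetical db/table/location names; assumes the ORC format's
// Hive name is "ORC"):
//   generateStagingCTASStatement(new HiveDatasetFinder.DbAndTable("outDb", "staging_tbl"),
//       "SELECT * FROM `srcDb`.`srcTbl`", StorageFormat.ORC, "/tmp/staging/staging_tbl")
// returns:
//   CREATE TEMPORARY TABLE `outDb`.`staging_tbl` STORED AS ORC
//       LOCATION '/tmp/staging/staging_tbl' AS SELECT * FROM `srcDb`.`srcTbl`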
@Override
public void run() {
  try {
    List<String> queries = generateHiveQueries();
    // Register auxiliary files and jars, run any setup queries, then execute the generated queries.
    this.hiveJdbcConnector.executeStatements(Lists.transform(this.addFiles, file -> "ADD FILE " + file).toArray(new String[]{}));
    this.hiveJdbcConnector.executeStatements(Lists.transform(this.addJars, file -> "ADD JAR " + file).toArray(new String[]{}));
    this.hiveJdbcConnector.executeStatements(this.setupQueries.toArray(new String[]{}));
    this.hiveJdbcConnector.executeStatements(queries.toArray(new String[queries.size()]));
    super.run();
  } catch (Exception e) {
    this.workingState = WorkUnitState.WorkingState.FAILED;
    log.error("Exception in HiveTask while generating or executing Hive queries.", e);
  }
}
dmlQuery.append(partitionKeyValues(optionalPartitionDMLInfo));
String createFinalTableDDL = HiveConverterUtils.generateCreateDuplicateTableDDL(outputDatabaseName, stagingTableName,
    outputTableName, outputDataLocation, Optional.of(outputDatabaseName));
publishQueries.add(createFinalTableDDL);
private TaskContext getTaskContextForRun(WorkUnit workUnit) {
  workUnit.setProp(ConfigurationKeys.JOB_ID_KEY, "job123");
  workUnit.setProp(ConfigurationKeys.TASK_ID_KEY, "task123");
  workUnit.setProp(HiveConverterUtils.HIVE_DATASET_DESTINATION_SKIP_SETGROUP, Boolean.toString(true));
  HiveTask.disableHiveWatermarker(workUnit);
  JobState jobState = new JobState("job", "job123");
  return new TaskContext(new WorkUnitState(workUnit, jobState));
}
public QueryBasedMaterializerQueryGenerator(WorkUnitState workUnitState) throws IOException {
  super(workUnitState);
  this.sourceQuery = workUnitState.getProp(HiveMaterializer.QUERY_RESULT_TO_MATERIALIZE_KEY);
  this.storageFormat = HiveConverterUtils.StorageFormat.valueOf(workUnitState.getProp(HiveMaterializer.STORAGE_FORMAT_KEY));
}
/**
 * Generates a CTAS statement to dump the contents of a table / partition into a new table.
 * @param outputDbAndTable output db and table where contents should be written.
 * @param sourceEntity source table / partition.
 * @param partitionDMLInfo map of partition names to partition values.
 * @param storageFormat format of output table.
 * @param outputTableLocation location where files of output table should be written.
 */
public static String generateStagingCTASStatementFromSelectStar(HiveDatasetFinder.DbAndTable outputDbAndTable,
    HiveDatasetFinder.DbAndTable sourceEntity, Map<String, String> partitionDMLInfo, StorageFormat storageFormat,
    String outputTableLocation) {
  StringBuilder sourceQueryBuilder = new StringBuilder("SELECT * FROM `").append(sourceEntity.getDb())
      .append("`.`").append(sourceEntity.getTable()).append("`");
  if (partitionDMLInfo != null && !partitionDMLInfo.isEmpty()) {
    sourceQueryBuilder.append(" WHERE ");
    sourceQueryBuilder.append(partitionDMLInfo.entrySet().stream()
        .map(e -> "`" + e.getKey() + "`='" + e.getValue() + "'")
        .collect(joining(" AND ")));
  }
  return generateStagingCTASStatement(outputDbAndTable, sourceQueryBuilder.toString(), storageFormat, outputTableLocation);
}
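// Example (illustrative, hypothetical names): with sourceEntity srcDb.srcTbl and partition map
// {datepartition=2017-07-15-08}, the SELECT * source query built above becomes
//   SELECT * FROM `srcDb`.`srcTbl` WHERE `datepartition`='2017-07-15-08'
// which is then wrapped by generateStagingCTASStatement into the staging CTAS shown earlier.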