/**
 * Initializes the hoodie dataset at the configured base path if it has not been initialized yet.
 *
 * @param fs {@link FileSystem}
 * @param hoodieConf {@link HoodieConfiguration}
 * @throws IOException if the file system cannot be accessed
 */
public static void initHoodieDataset(@NonNull final FileSystem fs,
    @NonNull final HoodieConfiguration hoodieConf) throws IOException {
    final Path hoodieMetaFolder = new Path(hoodieConf.getBasePath(), HoodieTableMetaClient.METAFOLDER_NAME);
    final Path hoodiePropertiesFile = new Path(hoodieMetaFolder.toString(),
        HoodieTableConfig.HOODIE_PROPERTIES_FILE);
    if (!fs.exists(hoodiePropertiesFile)) {
        HoodieTableMetaClient.initializePathAsHoodieDataset(
            FSUtils.getFs(hoodieConf.getConf()),
            hoodieConf.getBasePath(),
            hoodieConf.getHoodieInitProperties());
    }
}
@VisibleForTesting
protected int calculateNewBulkInsertParallelism(final long numRecords) {
    final long avgRecordSize = this.sinkStatMgr.getAvgRecordSize();
    final long targetFileSize = this.hoodieConf.getTargetFileSize();
    // Total bytes to write divided by the target file size, rounded up.
    final int newParallelism = (int) Math.ceil((numRecords * avgRecordSize * 1.0) / Math.max(1, targetFileSize));
    final int currentParallelism = this.hoodieConf.getBulkInsertParallelism();
    log.info(
        "StatsManager:targetFileSize:{}:avgRecordSize:{}:numRecords:{}:"
            + "newBulkInsertParallelism:{}:currentBulkInsertParallelism:{}",
        targetFileSize, avgRecordSize, numRecords, newParallelism, currentParallelism);
    return newParallelism;
}
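// A worked example of the sizing rule above, as a standalone sketch. The method name
// estimateBulkInsertParallelism and the sample numbers are hypothetical; only the
// formula mirrors calculateNewBulkInsertParallelism.
public static int estimateBulkInsertParallelism(final long numRecords,
    final long avgRecordSize, final long targetFileSize) {
    // Total bytes to write divided by the desired file size, rounded up;
    // Math.max guards against a zero or negative target file size.
    return (int) Math.ceil((numRecords * avgRecordSize * 1.0) / Math.max(1, targetFileSize));
}

// Example: 1,000,000 records of ~1KB each with a 128MB target file size gives
// ceil(1_000_000 * 1024 / 134_217_728) = 8 parallel bulk-insert tasks.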
/**
 * Reads a property value from the table-specific and default namespaces. The value is resolved in
 * the following order. For example, for the property key ("common.hoodie.%s.insert_split_size"):
 * 1) table-specific value ("common.hoodie.tables.table1.insert_split_size" defined in {@link Configuration})
 * 2) default hoodie property value ("common.hoodie.default.insert_split_size" defined in {@link Configuration})
 * 3) the default value passed in as an argument.
 *
 * @param propertyKey hoodie property key
 * @param defaultValue default value of the property
 * @param <T> data type of the property
 */
public <T> T getProperty(@NotEmpty final String propertyKey, @NonNull final T defaultValue) {
    final String defaultKey = getDefaultPropertyKey(propertyKey);
    final String tableKey = getTablePropertyKey(propertyKey, this.tableKey);
    final T retValue = Configuration.getProperty(this.conf, defaultKey, defaultValue);
    return Configuration.getProperty(this.conf, tableKey, retValue);
}
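// A hedged illustration of the resolution order documented above. The table name
// "table1" and the numeric values are hypothetical; the literal keys follow the
// "common.hoodie.*" pattern from the javadoc example.
final Configuration conf = new Configuration();
conf.setProperty("common.hoodie.default.insert_split_size", "500000");
conf.setProperty("common.hoodie.tables.table1.insert_split_size", "100000");
// For table1, getProperty resolves the table-specific key first and returns 100000.
// If only the default-namespace key were set it would return 500000, and with
// neither key set it would fall back to the default value passed as an argument.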
/**
 * @return hoodie table name.
 */
public String getTableName() {
    return this.getConf().getProperty(getTablePropertyKey(HOODIE_TABLE_NAME, this.tableKey)).get();
}
final HoodieWriteConfig.Builder builder = HoodieWriteConfig.newBuilder();
try {
    builder.forTable(getTableName());
    builder.withPath(getBasePath());
    final boolean combineBeforeInsert = getProperty(HOODIE_COMBINE_BEFORE_INSERT, DEFAULT_HOODIE_COMBINE_BEFORE_INSERT);
    final boolean combineBeforeUpsert = getProperty(HOODIE_COMBINE_BEFORE_UPSERT, DEFAULT_HOODIE_COMBINE_BEFORE_UPSERT);
    builder.combineInput(combineBeforeInsert, combineBeforeUpsert);

    // The Avro schema is mandatory; fail fast if it is not configured for this table.
    final String schemaPropertyKey = getTablePropertyKey(HOODIE_AVRO_SCHEMA, this.tableKey);
    final Optional<String> schema = this.conf.getProperty(schemaPropertyKey);
    if (!schema.isPresent()) {
        throw new MissingPropertyException(schemaPropertyKey);
    }
    builder.withSchema(schema.get());

    // Parallelism and commit handling.
    builder.withParallelism(this.getInsertParallelism(), this.getUpsertParallelism())
        .withBulkInsertParallelism(this.getBulkInsertParallelism());
    builder.withAutoCommit(false);

    // Compaction and cleaning.
    final HoodieCompactionConfig.Builder compactionConfigBuilder = HoodieCompactionConfig.newBuilder();
    compactionConfigBuilder.withCleanerPolicy(HoodieCleaningPolicy
        .valueOf(getProperty(HOODIE_CLEANER_POLICY, DEFAULT_HOODIE_CLEANER_POLICY)));
    compactionConfigBuilder.retainCommits(
        getProperty(HOODIE_CLEANER_COMMITS_RETAINED, DEFAULT_HOODIE_CLEANER_COMMITS_RETAINED));
    compactionConfigBuilder.retainFileVersions(
        getProperty(HOODIE_CLEANER_VERSIONS_RETAINED, DEFAULT_HOODIE_CLEANER_VERSIONS_RETAINED));
    final Integer insertSplitSize = getProperty(HOODIE_INSERT_SPLIT_SIZE, -1);
    if (insertSplitSize > 0) {
        // An explicitly configured split size disables auto-tuning.
        compactionConfigBuilder.autoTuneInsertSplits(false);
        compactionConfigBuilder.insertSplitSize(insertSplitSize);
    }
    compactionConfigBuilder.compactionSmallFileSize(
        getProperty(HOODIE_COMPACTION_SMALL_FILE_SIZE_LIMIT, DEFAULT_HOODIE_COMPACTION_SMALL_FILE_SIZE_LIMIT));
    compactionConfigBuilder.withAutoClean(shouldAutoClean());
    builder.withCompactionConfig(compactionConfigBuilder.build());
    return builder.build();
} catch (IllegalArgumentException e) {
    throw new JobRuntimeException("Failed to build hoodie write config", e);
}
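// A minimal usage sketch for the block above, assuming it is the body of a
// HoodieConfiguration method such as getHoodieWriteConfig(). The table name,
// base path and metrics prefix are hypothetical, and schemaStr is assumed to
// hold an Avro schema string; the builder chain mirrors the tests in this section.
final HoodieConfiguration hoodieConf = HoodieConfiguration.newBuilder("test-table")
    .withTableName("test-table")
    .withBasePath("/basePath")
    .withSchema(schemaStr)
    .withMetricsPrefix("test")
    .enableMetrics(false)
    .build();
final HoodieWriteConfig writeConfig = hoodieConf.getHoodieWriteConfig();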
private static <T> void verifyProperty(@NotEmpty final String tableName,
    @NonNull final HoodieConfiguration hoodieConf, @NonNull final T defaultValue,
    @NonNull final T defaultPropertyValue, @NonNull final T tableValue) {
    Object value = hoodieConf.getProperty(HoodieConfiguration.HOODIE_INSERT_SPLIT_SIZE, defaultValue);
    Assert.assertTrue(value.equals(defaultValue) && value.getClass() == defaultValue.getClass());
    hoodieConf.getConf().setProperty(
        HoodieConfiguration.getDefaultPropertyKey(HoodieConfiguration.HOODIE_INSERT_SPLIT_SIZE),
        defaultPropertyValue.toString());
    value = hoodieConf.getProperty(HoodieConfiguration.HOODIE_INSERT_SPLIT_SIZE, defaultValue);
    Assert.assertTrue(
        value.equals(defaultPropertyValue) && value.getClass() == defaultPropertyValue.getClass());
    hoodieConf.getConf().setProperty(
        HoodieConfiguration.getTablePropertyKey(HoodieConfiguration.HOODIE_INSERT_SPLIT_SIZE, tableName),
        tableValue.toString());
    value = hoodieConf.getProperty(HoodieConfiguration.HOODIE_INSERT_SPLIT_SIZE, defaultValue);
    Assert.assertTrue(
        value.equals(tableValue) && value.getClass() == tableValue.getClass());
}
@Test
public void testErrorTableConfiguration() {
    final String hoodieTableNameKey =
        HoodieConfiguration.getTablePropertyKey(HoodieConfiguration.HOODIE_TABLE_NAME, TARGET_TABLE);
    final String metricsPrefixKey =
        HoodieConfiguration.getTablePropertyKey(HoodieConfiguration.HOODIE_METRICS_PREFIX, TARGET_TABLE);
    final String basePathKey =
        HoodieConfiguration.getTablePropertyKey(HoodieConfiguration.HOODIE_BASE_PATH, TARGET_TABLE);
    final Configuration conf = new Configuration();
    conf.setProperty(hoodieTableNameKey, tableName);
    conf.setProperty(metricsPrefixKey, metricsPrefix);
    conf.setProperty(ErrorTableConfiguration.IS_ENABLED, "true");
    conf.setProperty(ErrorTableConfiguration.DESTINATION_PATH, errorBasePath);
    conf.setProperty(basePathKey, basePath);
    final HoodieConfiguration hoodieConfiguration = createHoodieConfiguration(conf);
    Assert.assertEquals(hoodieConfiguration.getBasePath(), errorBasePath);
    Assert.assertEquals(hoodieConfiguration.getTableName(), errorTableName);
    Assert.assertEquals(hoodieConfiguration.getHoodieMetricsPrefix(), errorMetricsPrefix);
}
@Test
public void testUpdateBulkInsertParallelism() {
    final String basePath = "/basePath";
    final String tableName = "test-table";
    final String schemaStr = getSchema("TS", "RECORD_KEY", 4, 8).toString();
    final HoodieConfiguration hoodieConf = HoodieConfiguration.newBuilder(tableName)
        .withTableName(tableName).withMetricsPrefix("test")
        .withBasePath(basePath).withSchema(schemaStr).enableMetrics(false).build();
    final HoodieSink mockSink = spy(new HoodieSink(hoodieConf, mock(HoodieSinkDataConverter.class),
        mock(JavaSparkContext.class), HoodieSink.HoodieSinkOp.NO_OP, new NoOpMetadataManager()));
    when(mockSink.calculateNewBulkInsertParallelism(anyLong())).thenReturn(18);
    Assert.assertTrue(mockSink.updateBulkInsertParallelism(1000));
    Assert.assertEquals(18, hoodieConf.getBulkInsertParallelism());
    Assert.assertEquals(HoodieConfiguration.DEFAULT_HOODIE_PARALLELISM, hoodieConf.getInsertParallelism());
}
/**
 * @return hoodie configuration for the error table, derived from the target table's configuration.
 */
public HoodieConfiguration getHoodieConfiguration(@NonNull final Configuration conf,
    @NotEmpty final String schema, @NotEmpty final String tableKey,
    @NotEmpty final String errorTableKey, final boolean errorMetricsEnabled) {
    final HoodieConfiguration hoodieConf = new HoodieConfiguration(conf, tableKey);
    final String errorTableName = getErrorTableName(hoodieConf.getTableName());
    final HoodieConfiguration.Builder builder = HoodieConfiguration.newBuilder(conf, errorTableKey)
        .withSchema(schema)
        .withTableName(errorTableName)
        .withBasePath(this.getDestPath().toString())
        .withBulkInsertParallelism(this.getWriteParallelism())
        .enableMetrics(errorMetricsEnabled)
        .withWriteStatusClass(HoodieWriteStatus.class);
    // TODO T1793431 fix error metrics and enable metrics
    if (errorMetricsEnabled) {
        final String errorMetricsPrefix = getErrorMetricsPrefix(hoodieConf.getHoodieMetricsPrefix());
        builder.withMetricsPrefix(errorMetricsPrefix);
    }
    return builder.build();
}
/**
 * @return list of mandatory properties.
 */
public List<String> getMandatoryProperties() {
    return Collections.unmodifiableList(
        Arrays.asList(
            getTablePropertyKey(HOODIE_TABLE_NAME, this.tableKey),
            getTablePropertyKey(HOODIE_BASE_PATH, this.tableKey)));
}
/**
 * @return a new {@link Builder} for the given table key, backed by an empty {@link Configuration}.
 */
public static Builder newBuilder(@NotEmpty final String tableKey) {
    return newBuilder(new Configuration(), tableKey);
}
/**
 * @return true if {@link com.uber.hoodie.HoodieWriteClient} should rollback inflight commits from
 *         a previous write call.
 */
public boolean shouldRollbackInFlight() {
    return getProperty(HOODIE_ROLLBACK_INFLIGHT_COMMITS, DEFAULT_HOODIE_ROLLBACK_INFLIGHT_COMMITS);
}
public void writeRecordsAndErrors(@NonNull final HoodieWriteResult result,
    final boolean isErrorTableEnabled) {
    try {
        if (result.getException().isPresent()) {
            throw result.getException().get();
        }
        if (result.getWriteStatuses().isPresent()) {
            if (isErrorTableEnabled) {
                // Pair every failed record with its error message so both can be
                // persisted to the error table.
                final JavaRDD<Tuple2<HoodieRecord, String>> hoodieRecordAndErrorTupleRDD =
                    result.getWriteStatuses().get()
                        .flatMap(ws -> ws.getFailedRecords().stream()
                            .map(fr -> new Tuple2<>(fr, ws.getErrors().get(fr.getKey()).getMessage()))
                            .iterator());
                final JavaRDD<ErrorData> errorRDD = hoodieRecordAndErrorTupleRDD
                    .map(r -> new ErrorData(r._2, RawDataHelper.getRawData(r._1)));
                ErrorTableUtil.writeErrorRecordsToErrorTable(this.jsc.sc(), this.hoodieConf.getConf(),
                    Optional.of(this.hoodieConf.getTableName()), new RDDWrapper<>(errorRDD),
                    new HoodieSinkErrorExtractor());
            }
        }
    } catch (HoodieInsertException | HoodieUpsertException e) {
        log.error("Error writing to hoodie", e);
        throw new JobRuntimeException("hoodie write failed :"
            + (result.getWriteStatuses().isPresent() ? result.getWriteStatuses().get().count() : -1), e);
    } catch (Exception e) {
        throw new JobRuntimeException("Error writing to hoodie", e);
    }
}
private static Map<String, String> readMetadataInfo(@NonNull final HoodieConfiguration hoodieConf) {
    try {
        final FileSystem fs = FSUtils.getFs(hoodieConf.getConf());
        HoodieUtil.initHoodieDataset(fs, hoodieConf);
        final HoodieTableMetaClient hoodieTableMetaClient = new HoodieTableMetaClient(
            new HadoopConfiguration(hoodieConf.getConf()).getHadoopConf(), hoodieConf.getBasePath(), true);
        final HoodieActiveTimeline hoodieActiveTimeline = hoodieTableMetaClient.getActiveTimeline();
        final java.util.Optional<HoodieInstant> lastInstant = hoodieActiveTimeline.getCommitTimeline()
            .filterCompletedInstants().lastInstant();
        if (lastInstant.isPresent()) {
            log.info("using hoodie instant for reading checkpoint info :{}", lastInstant.get().getTimestamp());
            final HoodieCommitMetadata commitMetadata =
                HoodieCommitMetadata.fromBytes(hoodieActiveTimeline.getInstantDetails(lastInstant.get()).get());
            final String serCommitInfo = commitMetadata.getMetadata(HOODIE_METADATA_KEY);
            if (!Strings.isNullOrEmpty(serCommitInfo)) {
                return MapUtil.deserializeMap(serCommitInfo);
            }
        }
        return new HashMap<>();
    } catch (IOException e) {
        log.error("failed to read metadata info", e);
        throw new JobRuntimeException("failed to read metadata information", e);
    }
}
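// A sketch of the producing side of this round trip, for context: the checkpoint
// map is serialized and attached to the commit metadata under HOODIE_METADATA_KEY
// before the commit lands in the timeline. The helper name buildCommitMetadata is
// hypothetical, and MapUtil.serializeMap is assumed here as the counterpart of the
// MapUtil.deserializeMap call above.
private static HoodieCommitMetadata buildCommitMetadata(
    @NonNull final Map<String, String> checkpointInfo) throws IOException {
    final HoodieCommitMetadata commitMetadata = new HoodieCommitMetadata();
    // Serialize the checkpoint map and stash it in the commit's extra metadata.
    commitMetadata.addMetadata(HOODIE_METADATA_KEY, MapUtil.serializeMap(checkpointInfo));
    return commitMetadata;
}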
final HoodieConfiguration hoodieConf = HoodieConfiguration.newBuilder(conf, tableName)
    .withTableName(tableName)
    .withBasePath(basePath.toString())
    .withSchema(schemaStr)
    .withMetricsPrefix("hoodieMetricsPrefix")
    .enableMetrics(false)
    .build();
// Before any records are written, the commit timeline has no completed instants.
Assert.assertTrue(
    new HoodieTableMetaClient(
        new HadoopConfiguration(hoodieConf.getConf()).getHadoopConf(), basePath.toString(), true)
        .getActiveTimeline().getCommitTimeline().filterCompletedInstants().empty());
// After records are written and committed (elided here), completed instants appear.
Assert.assertFalse(
    new HoodieTableMetaClient(
        new HadoopConfiguration(hoodieConf.getConf()).getHadoopConf(), basePath.toString(), true)
        .getActiveTimeline().getCommitTimeline().filterCompletedInstants().empty());
/**
 * @return hoodie init properties.
 */
public Properties getHoodieInitProperties() {
    final Properties props = new Properties();
    props.put(HoodieTableConfig.HOODIE_TABLE_NAME_PROP_NAME, this.getTableName());
    return props;
}
/**
 * Ensures that the hoodie dataset is present.
 */
protected void initDataset() {
    try {
        HoodieUtil.initHoodieDataset(FSUtils.getFs(this.hoodieConf.getConf()), this.hoodieConf);
    } catch (IOException e) {
        log.error("Error initializing hoodie dataset.", e);
        throw new JobRuntimeException("Could not initialize hoodie dataset", e);
    }
}
@Test
public void testUpdateInsertParallelism() {
    final String basePath = "/basePath";
    final String tableName = "test-table";
    final String schemaStr = getSchema("TS", "RECORD_KEY", 4, 8).toString();
    final HoodieConfiguration hoodieConf = HoodieConfiguration.newBuilder(tableName)
        .withTableName(tableName).withMetricsPrefix("test")
        .withBasePath(basePath).withSchema(schemaStr).enableMetrics(false).build();
    final HoodieSink mockSink = spy(new HoodieSink(hoodieConf, mock(HoodieSinkDataConverter.class),
        mock(JavaSparkContext.class), HoodieSink.HoodieSinkOp.NO_OP, new NoOpMetadataManager()));
    when(mockSink.calculateNewBulkInsertParallelism(anyLong())).thenReturn(18);
    Assert.assertTrue(mockSink.updateInsertParallelism(1000));
    Assert.assertEquals(18, hoodieConf.getInsertParallelism());
    Assert.assertEquals(HoodieConfiguration.DEFAULT_HOODIE_PARALLELISM, hoodieConf.getBulkInsertParallelism());
}
public Builder withInsertParallelism(final int parallelism) {
    this.conf.setProperty(
        getTablePropertyKey(HOODIE_INSERT_PARALLELISM, this.tableKey),
        Integer.toString(parallelism));
    return this;
}
/**
 * @return hoodie metrics prefix.
 */
public String getHoodieMetricsPrefix() {
    return this.getConf().getProperty(getTablePropertyKey(HOODIE_METRICS_PREFIX, this.tableKey)).get();
}