/**
 * Estimates the size of the given RDD in megabytes.
 *
 * <p>When the row count is greater than {@code ROW_SAMPLING_THRESHOLD}, the RDD is sampled
 * (without replacement) with fraction {@code ROW_SAMPLING_THRESHOLD / totalRows}, the sample is
 * measured with {@link FileSink#getSampleSizeInBytes(JavaRDD)}, converted to megabytes and then
 * divided by the fraction to extrapolate the total size. Otherwise the whole RDD is measured
 * directly with {@link FileSink#getSampleSizeInBytes(JavaRDD)}.
 *
 * @param data data whose size in megabytes should be estimated
 * @return estimated data size in megabytes
 */
protected double getRddSizeInMegaByte(@NonNull final JavaRDD<String> data) {
    final long rowCount = new RDDWrapper<String>(data).getCount();

    // Small enough: measure every row, no extrapolation needed.
    if (rowCount <= ROW_SAMPLING_THRESHOLD) {
        return (double) getSampleSizeInBytes(data) / FileUtils.ONE_MB;
    }

    log.debug("Start sampling on Write Data.");
    final double fraction = (double) ROW_SAMPLING_THRESHOLD / (double) rowCount;
    log.debug("Sample fraction: {}", fraction);

    final JavaRDD<String> sampledRows = data.sample(false, fraction);
    final long sampledBytes = getSampleSizeInBytes(sampledRows);
    final double sampledMegaBytes = (double) sampledBytes / FileUtils.ONE_MB;

    // Scale the sample's size back up to the full data set.
    return sampledMegaBytes / fraction;
}
public void writeRecordsAndErrors(@NonNull final HoodieWriteResult result, final boolean isErrorTableEnabled) { try { if (result.getException().isPresent()) { throw result.getException().get(); } if (result.getWriteStatuses().isPresent()) { if (isErrorTableEnabled) { // TODO: Can we make this more readable, please? final JavaRDD<Tuple2<HoodieRecord, String>> hoodieRecordAndErrorTupleRDD = result.getWriteStatuses().get() .flatMap(ws -> ws.getFailedRecords().stream().map(fr -> new Tuple2<>(fr, ws.getErrors().get(fr.getKey()).getMessage())).iterator()); final JavaRDD<ErrorData> errorRDD = hoodieRecordAndErrorTupleRDD .map(r -> new ErrorData(r._2, RawDataHelper.getRawData(r._1))); ErrorTableUtil.writeErrorRecordsToErrorTable(this.jsc.sc(), this.hoodieConf.getConf(), Optional.of(this.hoodieConf.getTableName()), new RDDWrapper<>(errorRDD), new HoodieSinkErrorExtractor()); } } } catch (HoodieInsertException | HoodieUpsertException e) { log.error("Error writing to hoodie", e); throw new JobRuntimeException("hoodie write failed :" + (result.getWriteStatuses().isPresent() ? result.getWriteStatuses().get().count() : -1), e); } catch (Exception e) { throw new JobRuntimeException("Error writing to hoodie", e); } }
public final RDDWrapper<OD> map(@NonNull final JavaRDD<ID> data) { final ForkOperator<IData> converter = new ForkOperator<>(data.map(r -> RawDataHelper.getRawData(r)), new DataConversionFunction(), this.conf); converter.execute(); // Write error records. ErrorTableUtil.writeErrorRecordsToErrorTable(data.context(), this.conf, Optional.absent(), new RDDWrapper<>(converter.getRDD(ERROR_RECORD).map(r -> (ErrorData) r), converter.getCount(ERROR_RECORD)), errorExtractor); return new RDDWrapper<>(converter.getRDD(VALID_RECORD).map(r -> ((ValidData<OD>) r).getData()), converter.getCount(VALID_RECORD)); }
final Map<String, String> tags = new HashMap<>(); tags.put(SINK_INFO_TAG, this.conf.getSinkType().name()); final RDDWrapper<String> dataWrapper = new RDDWrapper<>(dataToWrite); final long totalRows = dataWrapper.getCount(); this.dataFeedMetrics.get().createLongMetric(DataFeedMetricNames.OUTPUT_ROWCOUNT,
/**
 * Checks the count reported by {@code RDDWrapper} for both constructors: built from the RDD
 * alone, {@code getCount()} is expected to match the five elements of the data; built with an
 * explicit count of 1, {@code getCount()} is expected to return that value while the wrapped
 * RDD still holds all five elements.
 */
@Test
public void testCount() {
    final List<Integer> values = Arrays.asList(1, 2, 3, 4, 5);
    final JavaRDD<Integer> valuesRdd = this.jsc.get().parallelize(values);

    // No count supplied to the wrapper.
    final RDDWrapper<Integer> wrapperWithoutCount = new RDDWrapper<>(valuesRdd);
    Assert.assertEquals(5, wrapperWithoutCount.getCount());
    Assert.assertEquals(5, wrapperWithoutCount.getData().count());

    // Explicit count supplied: it is reported as-is, the data itself is untouched.
    final RDDWrapper<Integer> wrapperWithCount = new RDDWrapper<>(valuesRdd, 1);
    Assert.assertEquals(1, wrapperWithCount.getCount());
    Assert.assertEquals(5, wrapperWithCount.getData().count());
}
}
RDDWrapper<HoodieRecord<HoodieRecordPayload>> hoodieErrorRecords = new RDDWrapper(hoodieRecords, numErrors); hoodieSink.write(hoodieErrorRecords); } catch (IOException ioe) {