/**
 * Estimates the total serialized byte size of all rows in the given RDD by
 * measuring a fixed-size sample and extrapolating to the full row count.
 *
 * @param rdd wrapped RDD of Cassandra payload rows to size
 * @return estimated total size in bytes (0 for an empty RDD)
 */
public long estimateTotalSize(final RDDWrapper<CassandraPayload> rdd) {
    final long totalRows = rdd.getCount();
    // Sample WITH replacement so a non-empty RDD always yields exactly
    // NO_OF_SAMPLE_ROWS elements, keeping the extrapolation denominator valid.
    final List<CassandraPayload> sampleRows = rdd.getData().takeSample(true, NO_OF_SAMPLE_ROWS);
    // Sum in long space (mapToLong) to avoid int overflow that the previous
    // Stream<Integer>.reduce(...) accumulation was exposed to.
    final long byteSize = sampleRows.stream()
        .mapToLong(CassandraPayload::estimateRowSize)
        .sum();
    // Scale the sampled byte count by totalRows / NO_OF_SAMPLE_ROWS.
    return (long) (byteSize * ((totalRows * 1.0) / NO_OF_SAMPLE_ROWS));
}
}
/**
 * Calculates the size of the given data in megabytes.
 *
 * If the total row count exceeds {@code ROW_SAMPLING_THRESHOLD}, a sample of
 * roughly {@code ROW_SAMPLING_THRESHOLD} rows is measured via
 * {@link FileSink#getSampleSizeInBytes(JavaRDD)} and the result is scaled back
 * up by the sampling fraction; otherwise the full data set is measured directly.
 *
 * @param data data to calculate size in megabytes
 * @return estimated data size in megabytes
 */
protected double getRddSizeInMegaByte(@NonNull final JavaRDD<String> data) {
    final RDDWrapper<String> wrapper = new RDDWrapper<>(data);
    final long rowCount = wrapper.getCount();
    if (rowCount <= ROW_SAMPLING_THRESHOLD) {
        // Small enough to measure directly without sampling.
        return (double) getSampleSizeInBytes(data) / FileUtils.ONE_MB;
    }
    log.debug("Start sampling on Write Data.");
    final double samplingFraction = (double) ROW_SAMPLING_THRESHOLD / (double) rowCount;
    log.debug("Sample fraction: {}", samplingFraction);
    final JavaRDD<String> sampledRdd = data.sample(false, samplingFraction);
    final long sampledBytes = getSampleSizeInBytes(sampledRdd);
    final double sampledMegabytes = (double) sampledBytes / FileUtils.ONE_MB;
    // Extrapolate from the sample back to the full data set.
    return sampledMegabytes / samplingFraction;
}
// Convert the incoming data into Cassandra payload rows.
final RDDWrapper<CassandraPayload> payloadWrapper = this.converter.map(data);
// NOTE(review): fragment — the enclosing method and the close of this
// if-block lie outside the visible chunk; behavior of the non-zero path
// cannot be documented from here.
if (payloadWrapper.getCount() == 0) {
    // Tag the metric with "<keyspace>_<table>" so the zero-row count is
    // attributable to a specific Cassandra table.
    tags.put(TABLE_NAME_TAG,
        this.conf.getKeyspace() + StringTypes.UNDERSCORE + this.conf.getTableName());
    // Emit an output row count of 0 for this table.
    this.tableMetrics.get()
        .createLongMetric(DataFeedMetricNames.OUTPUT_ROWCOUNT, payloadWrapper.getCount(), tags);
// NOTE(review): fragment — enclosing method is outside the visible chunk.
// Tag the metric with the configured sink type (e.g. which sink wrote the data).
tags.put(SINK_INFO_TAG, this.conf.getSinkType().name());
// Wrap the outgoing RDD so the row count is computed (and cached) once.
final RDDWrapper<String> dataWrapper = new RDDWrapper<>(dataToWrite);
final long totalRows = dataWrapper.getCount();
// Report the total number of rows written by this feed.
this.dataFeedMetrics.get().createLongMetric(DataFeedMetricNames.OUTPUT_ROWCOUNT, totalRows, tags);
// Convert the incoming data into CQL statements.
final RDDWrapper<Statement> payloadWrapper = this.converter.map(data);
// NOTE(review): fragment — the enclosing method and the close of this
// if-block lie outside the visible chunk.
if (payloadWrapper.getCount() == 0) {
    // Tag the metric with "<keyspace>_<table>" so the zero-row count is
    // attributable to a specific Cassandra table.
    tags.put(TABLE_NAME_TAG,
        this.conf.getKeyspace() + StringTypes.UNDERSCORE + this.conf.getTableName());
    // Emit an output row count of 0 for this table.
    this.tableMetrics.get()
        .createLongMetric(DataFeedMetricNames.OUTPUT_ROWCOUNT, payloadWrapper.getCount(), tags);
public void write(@NonNull final RDDWrapper<HoodieRecord<HoodieRecordPayload>> hoodieRecords) { /** * 1) create new commit -> getOrCreate() * 2) insert records -> bulkInsert() / insert() / upsert() * 3) commit() -> commit() */ this.initDataset(); if (this.hoodieConf.shouldAutoTuneParallelism()) { calculateAndUpdateParallelism(hoodieRecords.getCount()); } final HoodieWriteConfig hoodieWriteConfig = this.hoodieConf.getHoodieWriteConfig(); try (final HoodieWriteClientWrapper hoodieWriteClient = getHoodieWriteClient(hoodieWriteConfig)) { final String commitTime = this.commitTime.isPresent() ? this.commitTime.get() : hoodieWriteClient.startCommit(); // Handle writes to hoodie. It can be an insert or upsert. final HoodieWriteResult result = handleWrite(hoodieWriteClient, hoodieRecords.getData(), commitTime, op); writeRecordsAndErrors(result, true); commit(hoodieWriteClient, commitTime, result.getWriteStatuses()); } }
@Test public void testExceptionHandling() { final int successRecords = 5; final int invalidDataRecords = 7; final int runtimeExceptionRecords = 1; final List<String> inputList = new ArrayList<>(); // Adding only success & invalid_data records. IntStream.range(0, successRecords).forEach(i -> inputList.add(SUCCESS)); IntStream.range(0, invalidDataRecords).forEach(i -> inputList.add(INVALID_DATA)); final MockAbstractDataConverter mockConverter = new MockAbstractDataConverter(new Configuration(), new ErrorExtractor()); final RDDWrapper<String> result = mockConverter.map(this.jsc.get().parallelize(inputList)); Assert.assertEquals(successRecords, result.getCount()); // Adding runtime exception records. This should fail the spark job. IntStream.range(0, runtimeExceptionRecords).forEach(i -> inputList.add(RUNTIME_EXCEPTION)); try { mockConverter.map(this.jsc.get().parallelize(inputList)); Assert.fail("expecting error here"); } catch (Exception e) { Assert.assertEquals(SparkException.class, e.getClass()); Assert.assertEquals(JobRuntimeException.class, e.getCause().getClass()); Assert.assertEquals(JobRuntimeException.class, e.getCause().getCause().getClass()); Assert.assertEquals(RUNTIME_EXCEPTION, e.getCause().getCause().getMessage()); } }
/**
 * Verifies RDDWrapper's count semantics: with no explicit count the wrapper
 * derives it from the RDD; an explicitly supplied count is reported as-is
 * even when it differs from the RDD's actual element count.
 */
@Test
public void testCount() {
    final List<Integer> list1 = Arrays.asList(1, 2, 3, 4, 5);
    final JavaRDD<Integer> list1RDD = this.jsc.get().parallelize(list1);
    // No explicit count: the wrapper must compute it from the RDD.
    final RDDWrapper<Integer> rddWrapper1 = new RDDWrapper<>(list1RDD);
    Assert.assertEquals(5, rddWrapper1.getCount());
    Assert.assertEquals(5, rddWrapper1.getData().count());
    // Explicit count is trusted as-is; the underlying RDD is unchanged.
    final RDDWrapper<Integer> rddWrapper2 = new RDDWrapper<>(list1RDD, 1);
    Assert.assertEquals(1, rddWrapper2.getCount());
    Assert.assertEquals(5, rddWrapper2.getData().count());
}
}
    return;
// NOTE(review): fragment — this chunk starts mid-method; the guard that
// triggers the early return above, and the close of the if-block below,
// are outside the visible lines.
final long numErrors = errorData.getCount();
log.info("number of Errors : {}", numErrors);
// Branch taken when the conversion produced no error records.
if (numErrors == 0) {