@Test
public void testCount() {
    final List<Integer> list1 = Arrays.asList(1, 2, 3, 4, 5);
    final JavaRDD<Integer> list1RDD = this.jsc.get().parallelize(list1);

    // When no count is supplied, RDDWrapper computes it from the RDD.
    final RDDWrapper<Integer> rddWrapper1 = new RDDWrapper<>(list1RDD);
    Assert.assertEquals(5, rddWrapper1.getCount());
    Assert.assertEquals(5, rddWrapper1.getData().count());

    // An explicitly supplied count is reported as-is, even if it disagrees with the RDD.
    final RDDWrapper<Integer> rddWrapper2 = new RDDWrapper<>(list1RDD, 1);
    Assert.assertEquals(1, rddWrapper2.getCount());
    Assert.assertEquals(5, rddWrapper2.getData().count());
}
public long estimateTotalSize(final RDDWrapper<CassandraPayload> rdd) {
    final long totalRows = rdd.getCount();
    final List<CassandraPayload> sampleRows = rdd.getData().takeSample(true, NO_OF_SAMPLE_ROWS);
    // Sum the estimated byte size of the sampled rows...
    final long byteSize = sampleRows.stream()
            .map(element -> element.estimateRowSize())
            .reduce((size, accumulator) -> size + accumulator)
            .orElse(0);
    // ...then extrapolate from the sample to the full row count.
    final long totalSize = (long) (byteSize * ((totalRows * 1.0) / NO_OF_SAMPLE_ROWS));
    return totalSize;
}
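// A worked illustration of the extrapolation above; every number here is
// hypothetical and only chosen to make the arithmetic visible.
final int noOfSampleRows = 100;          // stand-in for NO_OF_SAMPLE_ROWS
final long totalRows = 1_000_000L;       // stand-in for rdd.getCount()
final long sampleByteSize = 5_000L;      // summed estimateRowSize() over the 100-row sample
final long estimatedTotal = (long) (sampleByteSize * ((totalRows * 1.0) / noOfSampleRows));
// estimatedTotal == 50_000_000 bytes, i.e. roughly 50 MB for the whole RDD.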
/**
 * Calculates the size of the data in megabytes.
 * If the total row count exceeds ROW_SAMPLING_THRESHOLD:
 *   - sample the data down to roughly ROW_SAMPLING_THRESHOLD rows,
 *   - measure the sample size via {@link FileSink#getSampleSizeInBytes(JavaRDD)},
 *   - extrapolate the total size through the sampling fraction and convert to megabytes.
 * Otherwise the full data set is measured directly via {@link FileSink#getSampleSizeInBytes(JavaRDD)}.
 *
 * @param data data whose size should be estimated
 * @return estimated data size in megabytes
 */
protected double getRddSizeInMegaByte(@NonNull final JavaRDD<String> data) {
    final RDDWrapper<String> dataWrapper = new RDDWrapper<>(data);
    final long totalRows = dataWrapper.getCount();
    final double totalSize;
    if (totalRows > ROW_SAMPLING_THRESHOLD) {
        log.debug("Start sampling on Write Data.");
        final double fraction = (double) ROW_SAMPLING_THRESHOLD / (double) totalRows;
        log.debug("Sample fraction: {}", fraction);
        final JavaRDD<String> sampleRdd = data.sample(false, fraction);
        final long sampleSizeInBytes = getSampleSizeInBytes(sampleRdd);
        final double sampleSizeInMB = (double) sampleSizeInBytes / FileUtils.ONE_MB;
        totalSize = sampleSizeInMB / fraction;
    } else {
        totalSize = (double) getSampleSizeInBytes(data) / FileUtils.ONE_MB;
    }
    return totalSize;
}
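// A quick worked example of the sampling branch above; the threshold, row
// count, and sample measurement are all hypothetical.
final long rowSamplingThreshold = 1_000L;    // stand-in for ROW_SAMPLING_THRESHOLD
final long totalRows = 10_000L;              // stand-in for dataWrapper.getCount()
final double fraction = (double) rowSamplingThreshold / (double) totalRows;  // 0.1
final double sampleSizeInMB = 2.0;           // suppose the ~1,000-row sample measures 2 MB
final double totalSizeInMB = sampleSizeInMB / fraction;  // 2 / 0.1 = 20 MB estimated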
@Override
public JavaRDD<AvroPayload> getData(@NonNull final FileWorkUnitCalculator.FileWorkUnitCalculatorResult result) {
    Preconditions.checkState(result.hasWorkUnits(), "no work to do: " + this.conf.getDirectory());
    // todo: support more types
    Preconditions.checkState(this.conf.getType().equals("json"), "only json files supported");
    try {
        final FileSystem fs = this.conf.getFileSystem();
        final String filesToRead = result.getWorkUnits().stream()
                .map(LocatedFileStatus::getPath)
                .map(Path::toString)
                .collect(Collectors.joining(","));
        final RDD<String> fileRows = this.jsc.sc().textFile(filesToRead, 1);
        return this.converter.map(fileRows.toJavaRDD()).getData();
    } catch (IOException e) {
        throw new JobRuntimeException("Error getting files", e);
    }
}
public void writeRecordsAndErrors(@NonNull final HoodieWriteResult result, final boolean isErrorTableEnabled) {
    try {
        if (result.getException().isPresent()) {
            throw result.getException().get();
        }
        if (result.getWriteStatuses().isPresent()) {
            if (isErrorTableEnabled) {
                // TODO: Can we make this more readable, please?
                // Pair every failed record with the error message recorded for its key.
                final JavaRDD<Tuple2<HoodieRecord, String>> hoodieRecordAndErrorTupleRDD =
                        result.getWriteStatuses().get().flatMap(ws ->
                                ws.getFailedRecords().stream()
                                        .map(fr -> new Tuple2<>(fr, ws.getErrors().get(fr.getKey()).getMessage()))
                                        .iterator());
                final JavaRDD<ErrorData> errorRDD = hoodieRecordAndErrorTupleRDD
                        .map(r -> new ErrorData(r._2, RawDataHelper.getRawData(r._1)));
                ErrorTableUtil.writeErrorRecordsToErrorTable(this.jsc.sc(), this.hoodieConf.getConf(),
                        Optional.of(this.hoodieConf.getTableName()), new RDDWrapper<>(errorRDD),
                        new HoodieSinkErrorExtractor());
            }
        }
    } catch (HoodieInsertException | HoodieUpsertException e) {
        log.error("Error writing to hoodie", e);
        throw new JobRuntimeException("hoodie write failed: "
                + (result.getWriteStatuses().isPresent() ? result.getWriteStatuses().get().count() : -1), e);
    } catch (Exception e) {
        throw new JobRuntimeException("Error writing to hoodie", e);
    }
}
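// One possible answer to the TODO above: pulling the pairing step into a named
// helper keeps the same semantics while letting the flatMap read in one line.
// This is a sketch only; 'toFailedRecordPairs' is a hypothetical name and the
// enclosing class is assumed to be HoodieSink.
private static Iterator<Tuple2<HoodieRecord, String>> toFailedRecordPairs(final WriteStatus ws) {
    return ws.getFailedRecords().stream()
            .map(fr -> new Tuple2<>(fr, ws.getErrors().get(fr.getKey()).getMessage()))
            .iterator();
}
// The pipeline above would then reduce to:
//     result.getWriteStatuses().get().flatMap(HoodieSink::toFailedRecordPairs)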
@Test
public void testExceptionHandling() {
    final int successRecords = 5;
    final int invalidDataRecords = 7;
    final int runtimeExceptionRecords = 1;
    final List<String> inputList = new ArrayList<>();

    // Adding only success & invalid_data records.
    IntStream.range(0, successRecords).forEach(i -> inputList.add(SUCCESS));
    IntStream.range(0, invalidDataRecords).forEach(i -> inputList.add(INVALID_DATA));
    final MockAbstractDataConverter mockConverter =
            new MockAbstractDataConverter(new Configuration(), new ErrorExtractor());
    final RDDWrapper<String> result = mockConverter.map(this.jsc.get().parallelize(inputList));
    Assert.assertEquals(successRecords, result.getCount());

    // Adding runtime exception records. This should fail the Spark job.
    IntStream.range(0, runtimeExceptionRecords).forEach(i -> inputList.add(RUNTIME_EXCEPTION));
    try {
        mockConverter.map(this.jsc.get().parallelize(inputList));
        Assert.fail("expecting error here");
    } catch (Exception e) {
        Assert.assertEquals(SparkException.class, e.getClass());
        Assert.assertEquals(JobRuntimeException.class, e.getCause().getClass());
        Assert.assertEquals(JobRuntimeException.class, e.getCause().getCause().getClass());
        Assert.assertEquals(RUNTIME_EXCEPTION, e.getCause().getCause().getMessage());
    }
}
public void write(@NonNull final RDDWrapper<HoodieRecord<HoodieRecordPayload>> hoodieRecords) {
    /*
     * 1) create new commit -> startCommit()
     * 2) insert records -> bulkInsert() / insert() / upsert()
     * 3) commit -> commit()
     */
    this.initDataset();
    if (this.hoodieConf.shouldAutoTuneParallelism()) {
        calculateAndUpdateParallelism(hoodieRecords.getCount());
    }
    final HoodieWriteConfig hoodieWriteConfig = this.hoodieConf.getHoodieWriteConfig();
    try (final HoodieWriteClientWrapper hoodieWriteClient = getHoodieWriteClient(hoodieWriteConfig)) {
        final String commitTime =
                this.commitTime.isPresent() ? this.commitTime.get() : hoodieWriteClient.startCommit();
        // Handle writes to hoodie. It can be an insert or upsert.
        final HoodieWriteResult result = handleWrite(hoodieWriteClient, hoodieRecords.getData(), commitTime, op);
        writeRecordsAndErrors(result, true);
        commit(hoodieWriteClient, commitTime, result.getWriteStatuses());
    }
}
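// A minimal caller sketch for the workflow above. 'buildHoodieRecords' and
// 'hoodieSink' are assumed context, not part of the snippet.
final JavaRDD<HoodieRecord<HoodieRecordPayload>> records = buildHoodieRecords(); // hypothetical helper
// The wrapper carries the record count alongside the RDD, so the sink can
// auto-tune parallelism without triggering another count job.
final RDDWrapper<HoodieRecord<HoodieRecordPayload>> wrapped = new RDDWrapper<>(records);
// Drives: startCommit -> handleWrite (insert/upsert) -> writeRecordsAndErrors -> commit.
hoodieSink.write(wrapped);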
final Map<String, String> tags = new HashMap<>();
tags.put(SINK_INFO_TAG, this.conf.getSinkType().name());
final RDDWrapper<String> dataWrapper = new RDDWrapper<>(dataToWrite);
final long totalRows = dataWrapper.getCount();
this.dataFeedMetrics.get().createLongMetric(DataFeedMetricNames.OUTPUT_ROWCOUNT, totalRows, tags);
        .values();
final JavaRDD<AvroPayload> inputRDD = this.dataConverter.map(kafkaData).getData();
public final RDDWrapper<OD> map(@NonNull final JavaRDD<ID> data) {
    final ForkOperator<IData> converter = new ForkOperator<>(
            data.map(r -> RawDataHelper.getRawData(r)),
            new DataConversionFunction(), this.conf);
    converter.execute();
    // Write error records.
    ErrorTableUtil.writeErrorRecordsToErrorTable(data.context(), this.conf, Optional.absent(),
            new RDDWrapper<>(converter.getRDD(ERROR_RECORD).map(r -> (ErrorData) r),
                    converter.getCount(ERROR_RECORD)),
            errorExtractor);
    return new RDDWrapper<>(converter.getRDD(VALID_RECORD).map(r -> ((ValidData<OD>) r).getData()),
            converter.getCount(VALID_RECORD));
}
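// Conceptually, the fork above splits one converted stream into a valid branch
// and an error branch. A rough plain-Spark equivalent follows as a sketch only:
// 'runConversion' is a hypothetical stand-in for applying DataConversionFunction,
// and the real ForkOperator persists the tagged RDD so both branches cost a
// single pass over the data.
final JavaRDD<IData> converted = runConversion(data); // hypothetical helper
converted.cache(); // mirror ForkOperator's single-pass behaviour
final JavaRDD<ErrorData> errorRecords =
        converted.filter(r -> r instanceof ErrorData).map(r -> (ErrorData) r);
final JavaRDD<OD> validRecords =
        converted.filter(r -> r instanceof ValidData).map(r -> ((ValidData<OD>) r).getData());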
    return;
}
final long numErrors = errorData.getCount();
log.info("number of Errors : {}", numErrors);
if (numErrors == 0) {
    return;
}
// ... (error table schema and Hoodie sink setup elided in this excerpt) ...
final JavaRDD<GenericRecord> errorRecords = errorData.getData().map(error ->
        generateGenericErrorRecord(errorExtractor, errorTableSchema, error, applicationId));
// ... (mapping of errorRecords into 'hoodieRecords' elided) ...
final RDDWrapper<HoodieRecord<HoodieRecordPayload>> hoodieErrorRecords =
        new RDDWrapper<>(hoodieRecords, numErrors);
hoodieSink.write(hoodieErrorRecords);
} catch (IOException ioe) {
final RDDWrapper<CassandraPayload> payloadWrapper = this.converter.map(data);
if (payloadWrapper.getCount() == 0) {
    // ... (handling for an empty payload elided in this excerpt) ...
}
tags.put(TABLE_NAME_TAG, this.conf.getKeyspace() + StringTypes.UNDERSCORE + this.conf.getTableName());
this.tableMetrics.get()
        .createLongMetric(DataFeedMetricNames.OUTPUT_ROWCOUNT, payloadWrapper.getCount(), tags);
final JavaRDD<CassandraPayload> cassandraRecords = payloadWrapper.getData();
final SparkSourceDataConverter converter =
        new SparkSourceDataConverter(schema, commonSchema, new Configuration(),
                Collections.singleton(STRING_FIELD), new ErrorExtractor());
final JavaRDD<AvroPayload> payloadRDD = converter.map(df.javaRDD()).getData();
final RDDWrapper<Statement> payloadWrapper = this.converter.map(data);
if (payloadWrapper.getCount() == 0) {
    // ... (handling for an empty payload elided in this excerpt) ...
}
tags.put(TABLE_NAME_TAG, this.conf.getKeyspace() + StringTypes.UNDERSCORE + this.conf.getTableName());
this.tableMetrics.get()
        .createLongMetric(DataFeedMetricNames.OUTPUT_ROWCOUNT, payloadWrapper.getCount(), tags);
final String keyspaceName = this.conf.getKeyspace();
final JavaRDD<Statement> writtenRdd = payloadWrapper.getData().mapPartitions(iter -> {
    final Cluster.Builder builder = Cluster.builder().withClusterName(clusterName);
    if (this.conf.getNativePort().isPresent()) {
        TimestampInfo.generateEmptyTimestampInfo(), new ErrorExtractor());
final JavaRDD<CassandraPayload> cassRDD = csdc.map(payloadRDD).getData();
final List<CassandraPayload> payloads = cassRDD.collect();