/**
 * Estimates the total serialized byte size of the RDD by sampling rows.
 *
 * Takes a fixed-size sample (with replacement), sums the estimated row sizes,
 * and extrapolates the average row size to the full row count.
 *
 * @param rdd wrapper holding the Cassandra payload rows and their count
 * @return estimated total size in bytes; 0 for an empty RDD
 */
public long estimateTotalSize(final RDDWrapper<CassandraPayload> rdd) {
    final long totalRows = rdd.getCount();
    final List<CassandraPayload> sampleRows = rdd.getData().takeSample(true, NO_OF_SAMPLE_ROWS);
    // An empty RDD yields an empty sample; bail out before extrapolating.
    if (sampleRows.isEmpty()) {
        return 0L;
    }
    // Sum with a primitive LongStream instead of a boxed Optional reduce.
    final long sampleByteSize = sampleRows.stream()
        .mapToLong(CassandraPayload::estimateRowSize)
        .sum();
    // Extrapolate: (average sampled row size) * (total number of rows).
    // Divide by the actual sample size, not the requested sample count.
    return (long) (sampleByteSize * ((totalRows * 1.0) / sampleRows.size()));
}
}
@Override public JavaRDD<AvroPayload> getData(@NonNull final FileWorkUnitCalculator.FileWorkUnitCalculatorResult result) { Preconditions.checkState(result.hasWorkUnits(), "no work to do: " + this.conf.getDirectory()); // todo: support more types Preconditions.checkState(this.conf.getType().equals("json"), "only json files supported"); try { final FileSystem fs = this.conf.getFileSystem(); final String filesToRead = result.getWorkUnits().stream() .map(LocatedFileStatus::getPath) .map(Path::toString) .collect(Collectors.joining(",")); final RDD<String> fileRows = this.jsc.sc().textFile(filesToRead, 1); return this.converter.map(fileRows.toJavaRDD()).getData(); } catch (IOException e) { throw new JobRuntimeException("Error getting files", e); } } }
// Unwrap the RDD of Cassandra payload rows from the wrapper for downstream processing.
final JavaRDD<CassandraPayload> cassandraRecords = payloadWrapper.getData();
public void write(@NonNull final RDDWrapper<HoodieRecord<HoodieRecordPayload>> hoodieRecords) { /** * 1) create new commit -> getOrCreate() * 2) insert records -> bulkInsert() / insert() / upsert() * 3) commit() -> commit() */ this.initDataset(); if (this.hoodieConf.shouldAutoTuneParallelism()) { calculateAndUpdateParallelism(hoodieRecords.getCount()); } final HoodieWriteConfig hoodieWriteConfig = this.hoodieConf.getHoodieWriteConfig(); try (final HoodieWriteClientWrapper hoodieWriteClient = getHoodieWriteClient(hoodieWriteConfig)) { final String commitTime = this.commitTime.isPresent() ? this.commitTime.get() : hoodieWriteClient.startCommit(); // Handle writes to hoodie. It can be an insert or upsert. final HoodieWriteResult result = handleWrite(hoodieWriteClient, hoodieRecords.getData(), commitTime, op); writeRecordsAndErrors(result, true); commit(hoodieWriteClient, commitTime, result.getWriteStatuses()); } }
final String keyspaceName = this.conf.getKeyspace();
// Write each partition through its own Cassandra connection: a Cluster.Builder is
// configured per partition inside the mapPartitions closure.
JavaRDD<Statement> writtenRdd = payloadWrapper.getData().mapPartitions(iter -> {
final Cluster.Builder builder = Cluster.builder().withClusterName(clusterName);
// NOTE(review): presumably applies the configured native port to the builder here,
// with some default otherwise — rest of this branch is not visible in this chunk.
if (this.conf.getNativePort().isPresent()) {
// (continuation) closes the upstream chained call begun on an earlier line.
.values();
// Convert the raw Kafka records into AvroPayload rows via the configured converter.
final JavaRDD<AvroPayload> inputRDD = this.dataConverter.map(kafkaData).getData();
/**
 * Verifies that RDDWrapper reports the computed element count by default and the
 * caller-supplied count when one is given, while the wrapped data is unchanged
 * in both cases.
 */
@Test
public void testCount() {
    final List<Integer> numbers = Arrays.asList(1, 2, 3, 4, 5);
    final JavaRDD<Integer> numbersRdd = this.jsc.get().parallelize(numbers);

    // No explicit count: the wrapper derives it from the RDD itself.
    final RDDWrapper<Integer> derivedCountWrapper = new RDDWrapper<>(numbersRdd);
    Assert.assertEquals(5, derivedCountWrapper.getCount());
    Assert.assertEquals(5, derivedCountWrapper.getData().count());

    // Explicit count: reported verbatim, even when it disagrees with the data.
    final RDDWrapper<Integer> explicitCountWrapper = new RDDWrapper<>(numbersRdd, 1);
    Assert.assertEquals(1, explicitCountWrapper.getCount());
    Assert.assertEquals(5, explicitCountWrapper.getData().count());
}
}
// (continuation) trailing argument of a call begun on an earlier line.
false);
// Map each error into a GenericRecord conforming to the error-table schema,
// tagging it with the current application id.
JavaRDD<GenericRecord> errorRecords = errorData.getData().map(error -> generateGenericErrorRecord(
errorExtractor, errorTableSchema, error, applicationId));
// (continuation) RHS of an assignment begun on an earlier line: converter restricted
// to the single STRING_FIELD column.
new SparkSourceDataConverter(schema, commonSchema, new Configuration(), Collections.singleton(STRING_FIELD),
new ErrorExtractor());
// Run the DataFrame rows through the converter and unwrap the Avro payload RDD.
final JavaRDD<AvroPayload> payloadRDD = converter.map(df.javaRDD()).getData();
// (continuation) trailing constructor arguments from an earlier line: an empty
// timestamp info and a fresh error extractor.
TimestampInfo.generateEmptyTimestampInfo(), new ErrorExtractor());
// Convert the Avro payloads to Cassandra payloads, then materialize locally for assertions.
final JavaRDD<CassandraPayload> cassRDD = csdc.map(payloadRDD).getData();
final List<CassandraPayload> payloads = cassRDD.collect();