/**
 * Deserializes and returns the full record. Avoid calling this to fetch
 * top-level record fields; those are cached separately at construction time.
 */
public GenericRecord getData() {
    return SparkUtil.deserialize(this.byteRecord, recordClassTag);
}
public UtilTable(@NonNull final Class<T> type,
                 @NonNull final JavaRDD<T> javaRDD,
                 @NonNull final Path destPath,
                 final boolean isDatePartitioned,
                 @NonNull final SparkSession sparkSession) {
    this.spark = sparkSession;
    final RDD<T> rdd = javaRDD.rdd();
    final Encoder<T> bean = Encoders.bean(type);
    this.dataset = this.spark.createDataset(rdd, bean);
    this.destPath = destPath;
    this.isDatePartitioned = isDatePartitioned;
}
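// A minimal usage sketch for the constructor above, assuming a local SparkSession for
// illustration only; MyBean, the RDD contents, and the destination path are hypothetical.
import org.apache.hadoop.fs.Path;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.SparkSession;
import java.util.Arrays;

public final class UtilTableSketch {
    // Hypothetical JavaBean; Encoders.bean(...) requires getters/setters and a no-arg constructor.
    public static class MyBean implements java.io.Serializable {
        private String name;
        public String getName() { return this.name; }
        public void setName(final String name) { this.name = name; }
    }

    public static void main(final String[] args) {
        final SparkSession spark =
            SparkSession.builder().master("local[1]").appName("util-table-sketch").getOrCreate();
        final JavaSparkContext jsc = new JavaSparkContext(spark.sparkContext());
        final MyBean bean = new MyBean();
        bean.setName("example");
        final JavaRDD<MyBean> javaRDD = jsc.parallelize(Arrays.asList(bean));
        // Destination path and partitioning flag are illustrative values.
        final UtilTable<MyBean> table =
            new UtilTable<>(MyBean.class, javaRDD, new Path("/tmp/util-table"), false, spark);
    }
}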
public static <T, K extends ClassTag<T>> T deserialize(@NonNull final byte[] serializedRecord,
                                                       @NonNull final K classTag) {
    return getSerializerInstance().deserialize(ByteBuffer.wrap(serializedRecord), classTag);
}
@Test
public void testSparkConfOverrideDoesNotFailWithoutAnySparkConfDefinitions() {
    final Configuration conf =
        new Configuration(new File(TestConfiguration.CONFIG_YAML), Optional.absent());
    SparkUtil.getSparkConf("fooApp", Optional.absent(), Arrays.asList(), conf);
}
}
public AvroPayload(@NonNull final GenericRecord record,
                   @NonNull final List<String> fieldsToCache) {
    this.byteRecord = SparkUtil.serialize(record, recordClassTag);
    for (final String f : fieldsToCache) {
        this.rootFields.put(f, record.get(f));
    }
}
public final void execute() {
    this.forkFunction.registerAccumulators(this.inputRDD.rdd().sparkContext());
    // Converts JavaRDD<DI> -> JavaRDD<ForkData<DI>>, pairing each record with its fork keys (List<Integer>).
    final JavaRDD<ForkData<DI>> forkedData =
        this.inputRDD.flatMap(this.forkFunction).persist(this.persistLevel);
    final String jobName = SparkJobTracker.getJobName(this.inputRDD.rdd().sparkContext());
    forkedData.setName(String.format("%s-%s", jobName, forkedData.id()));
    // Deliberately calling count() so that the DAG gets executed and the persisted RDD is materialized.
    final long processedRecords = forkedData.count();
    final Optional<RDDInfo> rddInfo = SparkUtil.getRddInfo(forkedData.context(), forkedData.id());
    log.info("#processed records :{} name:{}", processedRecords, forkedData.name());
    if (rddInfo.isPresent()) {
        final long size = rddInfo.get().diskSize() + rddInfo.get().memSize();
        log.info("rddInfo -> name:{} partitions:{} size:{}",
            forkedData.name(), rddInfo.get().numPartitions(), size);
    }
    this.groupRDD = Optional.of(forkedData);
}
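// Side note on the count() above: persist() is lazy, so without an action nothing is computed
// or cached. A standalone illustration of the same persist-then-count pattern (the names here
// are hypothetical, not part of ForkOperator):
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.storage.StorageLevel;
import java.util.Arrays;

public final class PersistCountSketch {
    public static void main(final String[] args) {
        final JavaSparkContext jsc = new JavaSparkContext("local[1]", "persist-count-sketch");
        final JavaRDD<String> cached = jsc.parallelize(Arrays.asList("a", "b", "c"))
            .persist(StorageLevel.MEMORY_AND_DISK());
        // count() forces the DAG, so later consumers (e.g. per-fork reads) reuse the cached data.
        final long n = cached.count();
        System.out.println("materialized " + n + " records");
        jsc.stop();
    }
}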
/**
 * Creates the JavaSparkContext if it hasn't been created yet; otherwise returns the existing
 * instance. {@link #addSchema(Schema)} and {@link #addSchemas(Collection)} must not be called
 * once the JavaSparkContext has been created.
 * @return the JavaSparkContext that will be used to execute the JobDags
 */
public JavaSparkContext getOrCreateSparkContext() {
    if (!this.sparkContext.isPresent()) {
        this.sparkContext = Optional.of(
            new JavaSparkContext(
                SparkUtil.getSparkConf(
                    this.appName,
                    Optional.of(this.schemas),
                    this.serializationClasses,
                    this.conf)));
        this.sparkContext.get().sc().addSparkListener(new SparkEventListener());
        // Add the Hadoop configuration as a default resource.
        this.sparkContext.get().sc().hadoopConfiguration().addResource(
            new HadoopConfiguration(conf).getHadoopConf());
        this.appId = this.sparkContext.get().sc().applicationId();
    }
    return this.sparkContext.get();
}
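// Hedged usage sketch for the factory method above. `sparkFactory` stands for an
// already-configured instance of the enclosing class (how it is constructed is not shown in
// this fragment); the sketch only illustrates the ordering constraint from the javadoc.
// sparkFactory.addSchema(mySchema);                 // OK: context not created yet
// final JavaSparkContext jsc = sparkFactory.getOrCreateSparkContext();
// final JavaSparkContext same = sparkFactory.getOrCreateSparkContext(); // same instance returned
// sparkFactory.addSchema(otherSchema);              // NOT allowed once the context exists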
public AvroPayload(@NonNull final GenericRecord record) {
    this.byteRecord = SparkUtil.serialize(record, recordClassTag);
    // Cache all non-record top-level fields so they can be read without deserializing the payload.
    for (final Schema.Field f : record.getSchema().getFields()) {
        if (!RECORD.equals(f.schema().getType())) {
            this.rootFields.put(f.name(), record.get(f.name()));
        }
    }
}
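// Sketch of the intended access pattern for the two constructors above: non-record top-level
// fields come from the cached rootFields map without touching byteRecord, while getData()
// pays the full deserialization cost. The accessor name getField(...) is an assumption here,
// not confirmed by this fragment.
// final AvroPayload payload = new AvroPayload(record);
// final Object id = payload.getField("id");     // served from rootFields: cheap
// final GenericRecord full = payload.getData(); // deserializes the whole record: expensive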
    Utf8.class, Class.class));
addClassesIfFound(serializableClasses, Arrays.asList(
    "com.google.common.base.Present",
    "scala.reflect.ClassTag$$anon$1"));
@Test
public void testSparkConfOverride() {
    final Configuration conf = new Configuration(
        TestSparkUtil.class.getResourceAsStream("/configWithScopes.yaml"),
        Optional.of("incremental"));
    final SparkConf sparkConf =
        SparkUtil.getSparkConf("fooApp", Optional.absent(), Arrays.asList(), conf);
    Assert.assertEquals("4g", sparkConf.get("spark.executor.memory"));
    Assert.assertEquals("4g", sparkConf.get("spark.driver.memory"));
    Assert.assertEquals("100s", sparkConf.get("spark.network.timeout"));
}
public static <T, K extends ClassTag<T>> byte[] serialize(@NonNull final T record,
                                                          @NonNull final K classTag) {
    return getSerializerInstance().serialize(record, classTag).array();
}
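// Round-trip sketch for the serialize/deserialize pair above, assuming both are exposed by
// SparkUtil and that the underlying SerializerInstance can handle Avro generic records; the
// schema and record value below are illustrative only.
import org.apache.avro.Schema;
import org.apache.avro.SchemaBuilder;
import org.apache.avro.generic.GenericData;
import org.apache.avro.generic.GenericRecord;
import scala.reflect.ClassTag;
import scala.reflect.ClassTag$;

public final class SerializeRoundTripSketch {
    public static void main(final String[] args) {
        final Schema schema = SchemaBuilder.record("Example").fields()
            .requiredString("name").endRecord();
        final GenericRecord record = new GenericData.Record(schema);
        record.put("name", "foo");
        final ClassTag<GenericRecord> tag = ClassTag$.MODULE$.apply(GenericRecord.class);
        final byte[] bytes = SparkUtil.serialize(record, tag);
        final GenericRecord copy = SparkUtil.deserialize(bytes, tag);
        System.out.println(copy.get("name")); // prints "foo"
    }
}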