/**
 * Gets an extractor instance for the given configuration, instantiating the
 * configured {@code IExtractor} implementation through reflection.
 *
 * @param config the extractor configuration
 * @return the extractor instance
 */
public static <T, S extends BaseConfig> IExtractor<T, S> getExtractorInstance(S config) {
    try {
        Class<T> extractorImplClass = (Class<T>) config.getExtractorImplClass();
        if (extractorImplClass == null) {
            extractorImplClass = (Class<T>) Class.forName(config.getExtractorImplClassName());
        }

        Constructor<T> c;
        if (config.getEntityClass().isAssignableFrom(Cells.class)) {
            // Cells-based extractors expose a no-arg constructor.
            c = extractorImplClass.getConstructor();
            return (IExtractor<T, S>) c.newInstance();
        } else {
            // Entity-based extractors receive the entity class at construction time.
            c = extractorImplClass.getConstructor(Class.class);
            return (IExtractor<T, S>) c.newInstance(config.getEntityClass());
        }
    } catch (ClassNotFoundException | InstantiationException | IllegalAccessException
            | IllegalArgumentException | InvocationTargetException | NoSuchMethodException
            | SecurityException e) {
        String message = "Extractor instantiation failed, wrapping in DeepExtractorInitializationException: "
                + e.getMessage();
        LOG.error(message);
        throw new DeepExtractorInitializationException(message, e);
    }
}
/**
 * Saves the given RDD through the configured extractor.
 *
 * @param rdd          the RDD to persist
 * @param config       the extractor configuration
 * @param queryBuilder the update query builder used to write each element
 * @param <T>          the element type of the RDD
 * @param <S>          the configuration type
 */
public static <T, S extends BaseConfig> void saveRDD(RDD<T> rdd, S config, UpdateQueryBuilder queryBuilder) {
    config.setRddId(rdd.id());
    config.setPartitionId(0);
    rdd.foreachPartition(new PrepareSaveFunction<>(queryBuilder, config, rdd.first()));
}
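/*
 * Hedged usage sketch, not taken from the original sources: it shows how a caller
 * might persist a JavaRDD<Cells> through saveRDD. The cellsRdd and queryBuilder
 * parameters, and the MyCellsExtractor class, are hypothetical placeholders; the
 * concrete extractor and query builder depend on the target datastore.
 */
public static void saveCellsExample(JavaRDD<Cells> cellsRdd, UpdateQueryBuilder queryBuilder) {
    BaseConfig<Cells, BaseConfig> saveConfig = new BaseConfig<>();
    saveConfig.setEntityClass(Cells.class);
    saveConfig.setExtractorImplClass(MyCellsExtractor.class); // hypothetical extractor implementation
    // saveRDD tags the config with the RDD id, resets the partition counter
    // and runs PrepareSaveFunction on every partition.
    saveRDD(cellsRdd.rdd(), saveConfig, queryBuilder);
}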
public DeepRDD(SparkContext sc, S config) {
    super(sc, scala.collection.Seq$.MODULE$.empty(),
            ClassTag$.MODULE$.<T>apply(config.getEntityClass()));
    config.setRddId(id());
    this.config = sc.broadcast(config, ClassTag$.MODULE$.<S>apply(config.getClass()));
}
@Override
public BoxedUnit apply(Iterator<T> v1) {
    IExtractor<T, S> extractor;
    try {
        extractor = getExtractorInstance(config);
    } catch (DeepExtractorInitializationException e) {
        // Fall back to the out-of-process extractor client when the
        // implementation cannot be instantiated locally.
        extractor = getExtractorClient();
    }

    extractor.initSave(config, first, queryBuilder);
    while (v1.hasNext()) {
        extractor.saveRDD(v1.next());
    }
    config.setPartitionId(config.getPartitionId() + 1);
    extractor.close();
    return BoxedUnit.UNIT;
}
@Override
public void initSave(S config, T first, UpdateQueryBuilder queryBuilder) {
    int id = config.getRddId();
    int partitionIndex = config.getPartitionId();

    TaskAttemptID attemptId = DeepSparkHadoopMapReduceUtil
            .newTaskAttemptID(jobTrackerId, id, true, partitionIndex, 0);
    Configuration configuration = getHadoopConfig(config);
    hadoopAttemptContext = DeepSparkHadoopMapReduceUtil
            .newTaskAttemptContext(configuration, attemptId);

    try {
        writer = outputFormat.getRecordWriter(hadoopAttemptContext);
    } catch (IOException | InterruptedException e) {
        throw new DeepGenericException(e);
    }
}
/**
 * Tests that getExtractorInstance builds an extractor for a Cells-based configuration.
 *
 * @throws Exception if the extractor cannot be instantiated
 */
@Test
public void testGetExtractorInstance() throws Exception {
    BaseConfig<Cells, BaseConfig> baseConfig = new BaseConfig<>();
    baseConfig.setEntityClass(Cells.class);
    baseConfig.setExtractorImplClass(testExtractor.class);

    IExtractor extractorInstance1 = getExtractorInstance(baseConfig);

    assertNotNull(extractorInstance1);
}
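/*
 * Hedged companion sketch to the test above (not present in the original suite):
 * it exercises the entity-class branch of getExtractorInstance, where the
 * extractor is created through its Class-taking constructor. TestEntity and
 * TestEntityExtractor are hypothetical fixture classes.
 */
@Test
public void testGetExtractorInstanceForEntity() throws Exception {
    BaseConfig<TestEntity, BaseConfig> entityConfig = new BaseConfig<>();
    entityConfig.setEntityClass(TestEntity.class);
    entityConfig.setExtractorImplClass(TestEntityExtractor.class);

    // TestEntity is not assignable from Cells, so the Class-based
    // constructor of TestEntityExtractor is invoked reflectively.
    IExtractor extractorInstance = getExtractorInstance(entityConfig);

    assertNotNull(extractorInstance);
}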
public DeepJavaRDD(DeepRDD<T, S> rdd) {
    super(rdd, ClassTag$.MODULE$.<T>apply(rdd.config.value().getEntityClass()));
}
@Override
public Partition[] getPartitions(S config) {
    int id = config.getRddId();
    jobId = new JobID(jobTrackerId, id);

    Configuration conf = getHadoopConfig(config);
    JobContext jobContext = DeepSparkHadoopMapReduceUtil.newJobContext(conf, jobId);

    try {
        List<InputSplit> splits = inputFormat.getSplits(jobContext);
        // One Spark partition per Hadoop input split.
        Partition[] partitions = new Partition[splits.size()];
        for (int i = 0; i < splits.size(); i++) {
            partitions[i] = new NewHadoopPartition(id, i, splits.get(i));
        }
        return partitions;
    } catch (IOException | InterruptedException | RuntimeException e) {
        LOG.error("Unable to calculate partitions: " + e.getMessage());
        throw new DeepGenericException("Unable to calculate partitions", e);
    }
}
@Override
public ClassTag<T> classTag() {
    return ClassTag$.MODULE$.<T>apply(
            ((BaseConfig<T, BaseConfig>) ((DeepRDD) this.rdd()).config.value()).getEntityClass());
}
@Override
public void initIterator(Partition dp, S config) {
    int id = config.getRddId();
    NewHadoopPartition split = (NewHadoopPartition) dp;

    TaskAttemptID attemptId = DeepSparkHadoopMapReduceUtil
            .newTaskAttemptID(jobTrackerId, id, true, split.index(), 0);
    Configuration configuration = getHadoopConfig(config);
    TaskAttemptContext hadoopAttemptContext = DeepSparkHadoopMapReduceUtil
            .newTaskAttemptContext(configuration, attemptId);

    try {
        reader = inputFormat.createRecordReader(split.serializableHadoopSplit().value(), hadoopAttemptContext);
        reader.initialize(split.serializableHadoopSplit().value(), hadoopAttemptContext);
    } catch (IOException | InterruptedException e) {
        throw new DeepGenericException(e);
    }
}
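/*
 * Hedged sketch, not present in the original class: one way the RecordReader
 * prepared by initIterator can back hasNext()/next(). The transformElement(...)
 * hook is hypothetical and stands in for whatever mapping the concrete extractor
 * applies to the raw Hadoop value.
 */
@Override
public boolean hasNext() {
    try {
        // nextKeyValue() both advances the reader and reports availability, so a
        // production implementation should cache the result if hasNext() may be
        // called more than once per element.
        return reader.nextKeyValue();
    } catch (IOException | InterruptedException e) {
        throw new DeepGenericException(e);
    }
}

@Override
public T next() {
    try {
        return transformElement(reader.getCurrentValue());
    } catch (IOException | InterruptedException e) {
        throw new DeepGenericException(e);
    }
}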