public ParquetWorkUnitCalculator(@NonNull final HiveSourceConfiguration hiveConf,
                                 @NonNull final FileSystem fs) throws IOException {
    this.hiveConf = hiveConf;
    final PartitionType partitionType = hiveConf.getPartitionType();
    log.info("Creating partition manager with partition type: {}", partitionType);
    if (partitionType.equals(PartitionType.NONE) || partitionType.equals(PartitionType.NORMAL)) {
        // create partition manager internally
        this.partitionManager = new HDFSPartitionManager(hiveConf.getJobName(),
            hiveConf.getBaseMetadataPath(),
            hiveConf.getDataPath(),
            fs);
    } else if (partitionType.equals(PartitionType.DATE)) {
        this.partitionManager = new HDFSDatePartitionManager(hiveConf.getJobName(),
            hiveConf.getBaseMetadataPath(),
            hiveConf.getDataPath(),
            hiveConf.getPartitionKeyName().get(),
            getHiveConf().getStartDate(),
            fs);
    } else {
        throw new JobRuntimeException("Error: Partition type is not supported. Partition type: " + partitionType);
    }
}
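For context, a caller might wire up the calculator roughly as follows. This is a minimal sketch, assuming the property keys and the FSUtils.getFs helper shown in the test setup later in this section; the job name and paths are placeholders, not values from the original code.

// Sketch: construct a calculator for a NORMAL (non-date) partition layout.
// Property keys and FSUtils.getFs(...) are taken from the test code below; paths are placeholders.
final Configuration conf = new Configuration();
conf.setProperty(HiveSourceConfiguration.JOB_NAME, "hive-ingest-job");
conf.setProperty(HiveSourceConfiguration.HIVE_DATA_PATH, "/data/warehouse/my_table");
conf.setProperty(HiveSourceConfiguration.BASE_METADATA_PATH, "/metadata/marmaray");
conf.setProperty(HiveSourceConfiguration.PARTITION_TYPE, PartitionType.NORMAL.toString());

final HiveSourceConfiguration hiveConf = new HiveSourceConfiguration(conf);
final FileSystem fs = FSUtils.getFs(conf);
// Picks HDFSPartitionManager because the partition type is NORMAL.
final ParquetWorkUnitCalculator calculator = new ParquetWorkUnitCalculator(hiveConf, fs);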
public HiveSourceConfiguration(@NonNull final Configuration conf) {
    super(conf);
    this.saveCheckpoint = this.getConf().getBooleanProperty(SAVE_CHECKPOINT, true);
    this.startDate = getConf().getProperty(HIVE_START_DATE).isPresent()
        ? Optional.of(DateTime.parse(getConf().getProperty(HIVE_START_DATE).get(),
            DateTimeFormat.forPattern(HIVE_START_DATE_FORMAT).withZoneUTC()).toDate())
        : Optional.absent();
}
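The start date is optional and only parsed when the property is present. A rough usage sketch follows; it assumes HIVE_START_DATE is a public key on HiveSourceConfiguration like the other keys used in the tests, and the literal date string is a placeholder that must match whatever pattern HIVE_START_DATE_FORMAT defines.

// Sketch: enabling the optional start date (value format is an assumption, see lead-in).
final Configuration conf = new Configuration();
conf.setProperty(HiveSourceConfiguration.JOB_NAME, "hive-ingest-job");
conf.setProperty(HiveSourceConfiguration.HIVE_DATA_PATH, "/data/warehouse/my_table");
conf.setProperty(HiveSourceConfiguration.BASE_METADATA_PATH, "/metadata/marmaray");
conf.setProperty(HiveSourceConfiguration.HIVE_START_DATE, "2017-05-01");

final HiveSourceConfiguration hiveConf = new HiveSourceConfiguration(conf);
// With the property set, getStartDate() holds the parsed UTC date;
// without it, the constructor above leaves it as Optional.absent().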
@Test
public void testBasicConfig() {
    final Configuration config = getValidHiveSourceConfiguration();
    final HiveSourceConfiguration hiveConfig = new HiveSourceConfiguration(config);
    Assert.assertEquals(JOB_NAME, hiveConfig.getJobName());
    Assert.assertEquals(DEFAULT_DATA_PATH, hiveConfig.getDataPath());
    Assert.assertEquals(DEFAULT_METADATA_PATH, hiveConfig.getBaseMetadataPath());
    Assert.assertTrue(hiveConfig.shouldSaveCheckpoint());
}
@Override
public JavaRDD<AvroPayload> getData(@NonNull final ParquetWorkUnitCalculatorResult workUnitCalcResult) {
    Preconditions.checkState(workUnitCalcResult.hasWorkUnits(),
        "No work to process for: " + hiveConf.getDataPath());
    /*
     * The current implementation of HiveSource assumes that only a single work unit exists, which
     * corresponds to the single partition that is processed per job.
     */
    final List<String> workUnits = workUnitCalcResult.getWorkUnits();
    final String hdfsPath = new Path(this.hiveConf.getDataPath(), workUnits.get(0)).toString();
    log.info("Reading data from path: {}", hdfsPath);
    final Dataset<Row> data = this.sqlContext.read().parquet(hdfsPath);
    final int numPartitions = calculateHiveNumPartitions(data);
    log.info("Using {} partitions", numPartitions);
    final JavaRDD<AvroPayload> hiveRawData = data
        .coalesce(numPartitions)
        .javaRDD()
        .flatMap(row -> {
            final List<AvroPayload> payloads = new ArrayList<>();
            this.converter.convert(row)
                .forEach(d -> payloads.add(d.getSuccessData().get().getData()));
            return payloads.iterator();
        });
    return hiveRawData;
}
@Test(expected = MissingPropertyException.class)
public void testMissingHiveDataPath() {
    final Configuration config = new Configuration();
    config.setProperty(HiveSourceConfiguration.JOB_NAME, JOB_NAME);
    // Constructing the configuration without HIVE_DATA_PATH should throw MissingPropertyException.
    final HiveSourceConfiguration hiveConfig = new HiveSourceConfiguration(config);
    Assert.fail();
}
public static HiveSourceConfiguration initializeConfig(final String jobName,
                                                       final String dataPath,
                                                       final String metadataPath) {
    final Configuration config = new Configuration();
    config.setProperty(HiveSourceConfiguration.JOB_NAME, jobName);
    config.setProperty(HiveSourceConfiguration.HIVE_DATA_PATH, dataPath);
    config.setProperty(HiveSourceConfiguration.BASE_METADATA_PATH, metadataPath);
    return new HiveSourceConfiguration(config);
}
final HiveSourceConfiguration hiveConf = HiveTestUtil.initializeConfig(JOB_NAME, dataPath, "testMetadataPath");
final SparkSourceDataConverter converter = new SparkSourceDataConverter(dfSchema,
    avroSchema,
    hiveConf.getConf(),
    Sets.newHashSet(LEFT_FIELD, RIGHT_FIELD),
    new ErrorExtractor());
final HiveSource source = new HiveSource(hiveConf, this.sqlContext.get(), converter);
@Test(expected = MissingPropertyException.class)
public void testMissingJobName() {
    final Configuration config = new Configuration();
    config.setProperty(HiveSourceConfiguration.HIVE_DATA_PATH, DEFAULT_DATA_PATH);
    // Constructing the configuration without JOB_NAME should throw MissingPropertyException.
    final HiveSourceConfiguration hiveConfig = new HiveSourceConfiguration(config);
    Assert.fail();
}
@Before
public void setupTest() throws IOException {
    this.config = new Configuration();
    this.fileSystem = FSUtils.getFs(this.config);
    this.dataPath = FileTestUtil.getTempFolder();
    this.metadataPath = FileTestUtil.getTempFolder();
    this.config.setProperty(HiveSourceConfiguration.JOB_NAME, JOB_NAME);
    this.config.setProperty(HiveSourceConfiguration.BASE_METADATA_PATH, this.metadataPath);
    this.config.setProperty(HiveSourceConfiguration.HIVE_DATA_PATH, this.dataPath);
    this.config.setProperty(HiveSourceConfiguration.PARTITION_TYPE, PartitionType.NORMAL.toString());
    this.hiveConfig = new HiveSourceConfiguration(this.config);
}