public HadoopIngestionSpec withDataSchema(DataSchema schema)
{
  return new HadoopIngestionSpec(
      schema,
      ioConfig,
      tuningConfig,
      uniqueId
  );
}
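For context, this follows the immutable copy-on-write pattern used throughout the spec classes: a minimal usage sketch, assuming hypothetical `existingSpec` and `newSchema` values obtained elsewhere.

// Hypothetical variables: existingSpec and newSchema come from the caller.
HadoopIngestionSpec updatedSpec = existingSpec.withDataSchema(newSchema);
// existingSpec is unchanged; updatedSpec reuses its ioConfig, tuningConfig, and uniqueId.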
public void verify()
{
  Preconditions.checkNotNull(schema.getDataSchema().getDataSource(), "dataSource");
  Preconditions.checkNotNull(schema.getDataSchema().getParser().getParseSpec(), "parseSpec");
  Preconditions.checkNotNull(schema.getDataSchema().getParser().getParseSpec().getTimestampSpec(), "timestampSpec");
  Preconditions.checkNotNull(schema.getDataSchema().getGranularitySpec(), "granularitySpec");
  Preconditions.checkNotNull(pathSpec, "inputSpec");
  Preconditions.checkNotNull(schema.getTuningConfig().getWorkingPath(), "workingPath");
  Preconditions.checkNotNull(schema.getIOConfig().getSegmentOutputPath(), "segmentOutputPath");
  Preconditions.checkNotNull(schema.getTuningConfig().getVersion(), "version");
}
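Each `Preconditions.checkNotNull` throws a `NullPointerException` carrying the given label as its message, so a misconfigured spec fails fast with the name of the missing field. A hedged sketch of the failure path:

// Sketch only: a config whose spec never set a dataSource.
try {
  config.verify();
} catch (NullPointerException e) {
  // e.getMessage() is the label passed to checkNotNull, e.g. "dataSource"
}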
@Override
public void run()
{
  try {
    Injector injector = makeInjector();

    config = getHadoopDruidIndexerConfig();

    MetadataStorageUpdaterJobSpec metadataSpec = config.getSchema().getIOConfig().getMetadataUpdateSpec();
    // override metadata storage type based on HadoopIOConfig
    Preconditions.checkNotNull(metadataSpec.getType(), "type in metadataUpdateSpec must not be null");
    injector.getInstance(Properties.class).setProperty("druid.metadata.storage.type", metadataSpec.getType());

    config = HadoopDruidIndexerConfig.fromSpec(
        HadoopIngestionSpec.updateSegmentListIfDatasourcePathSpecIsUsed(
            config.getSchema(),
            HadoopDruidIndexerConfig.JSON_MAPPER,
            new MetadataStoreBasedUsedSegmentLister(
                injector.getInstance(IndexerMetadataStorageCoordinator.class)
            )
        )
    );

    List<Jobby> jobs = new ArrayList<>();
    jobs.add(new HadoopDruidDetermineConfigurationJob(config));
    jobs.add(new HadoopDruidIndexerJob(config, injector.getInstance(MetadataStorageUpdaterJobHandler.class)));
    JobHelper.runJobs(jobs, config);
  }
  catch (Exception e) {
    throw Throwables.propagate(e);
  }
}
public void setGranularitySpec(GranularitySpec granularitySpec)
{
  this.schema = schema.withDataSchema(schema.getDataSchema().withGranularitySpec(granularitySpec));
  this.pathSpec = JSON_MAPPER.convertValue(schema.getIOConfig().getPathSpec(), PathSpec.class);
}
public void setShardSpecs(Map<Long, List<HadoopyShardSpec>> shardSpecs)
{
  this.schema = schema.withTuningConfig(schema.getTuningConfig().withShardSpecs(shardSpecs));
  this.pathSpec = JSON_MAPPER.convertValue(schema.getIOConfig().getPathSpec(), PathSpec.class);
}
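A sketch of how a caller might assemble the map, assuming the `Long` key is the time bucket's start in millis and using Druid's `DateTimes` utility; the single-shard `NoneShardSpec` setup is illustrative, not taken from this code.

// Illustrative only: one time bucket containing a single, unpartitioned shard.
Map<Long, List<HadoopyShardSpec>> shardSpecs = new TreeMap<>();
shardSpecs.put(
    DateTimes.of("2010-01-01").getMillis(),
    Collections.singletonList(new HadoopyShardSpec(NoneShardSpec.instance(), 0))
);
config.setShardSpecs(shardSpecs);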
public GranularitySpec getGranularitySpec()
{
  return schema.getDataSchema().getGranularitySpec();
}
public void addJobProperties(Configuration conf)
{
  for (final Map.Entry<String, String> entry : schema.getTuningConfig().getJobProperties().entrySet()) {
    conf.set(entry.getKey(), entry.getValue());
  }
}
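A hedged sketch of the call site: the `jobProperties` from the tuning config are copied verbatim into the Hadoop `Configuration` before submission, which is how spec-level settings such as `mapreduce.job.user.classpath.first` reach the job.

// Sketch only: create a job and let the spec's properties override its Configuration.
Job job = Job.getInstance(new Configuration(), "druid-index-job");
config.addJobProperties(job.getConfiguration());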
public String runTask(String[] args) throws Exception
{
  final String schema = args[0];
  final String workingPath = args[1];
  final String segmentOutputPath = args[2];
  final String hadoopJobIdFile = args[3];

  final HadoopIngestionSpec theSchema = HadoopDruidIndexerConfig.JSON_MAPPER
      .readValue(
          schema,
          HadoopIngestionSpec.class
      );
  final HadoopDruidIndexerConfig config = HadoopDruidIndexerConfig.fromSpec(
      theSchema
          .withIOConfig(theSchema.getIOConfig().withSegmentOutputPath(segmentOutputPath))
          .withTuningConfig(theSchema.getTuningConfig().withWorkingPath(workingPath))
  );

  job = new HadoopDruidDetermineConfigurationJob(config);
  job.setHadoopJobIdFile(hadoopJobIdFile);

  log.info("Starting a hadoop determine configuration job...");
  if (job.run()) {
    return HadoopDruidIndexerConfig.JSON_MAPPER.writeValueAsString(
        new HadoopDetermineConfigInnerProcessingStatus(config.getSchema(), job.getStats(), null)
    );
  } else {
    return HadoopDruidIndexerConfig.JSON_MAPPER.writeValueAsString(
        new HadoopDetermineConfigInnerProcessingStatus(null, job.getStats(), job.getErrorMessage())
    );
  }
}
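The arguments are positional; a hypothetical invocation (all path values illustrative) that wires in the spec JSON plus the working, output, and job-id-file paths:

// Hypothetical values: spec JSON string plus working, output, and job-id-file paths.
String statusJson = runTask(new String[]{
    specJson,
    "/tmp/druid-indexing",
    "hdfs://namenode/druid/segments",
    "/tmp/hadoop-job-id"
});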
Preconditions.checkArgument(
    this.spec.getIOConfig().getSegmentOutputPath() == null,
    "segmentOutputPath must be absent"
);
Preconditions.checkArgument(this.spec.getTuningConfig().getWorkingPath() == null, "workingPath must be absent");
Preconditions.checkArgument(
    this.spec.getIOConfig().getMetadataUpdateSpec() == null,
    "metadataUpdateSpec must be absent"
);
boolean determineIntervals = !spec.getDataSchema().getGranularitySpec().bucketIntervals().isPresent();

spec = HadoopIngestionSpec.updateSegmentListIfDatasourcePathSpecIsUsed(
    spec,
    jsonMapper,
    usedSegmentLister // a UsedSegmentLister; the concrete instance is not shown in this fragment
);

Interval interval = JodaUtils.umbrellaInterval(
    JodaUtils.condenseIntervals(
        indexerSchema.getDataSchema().getGranularitySpec().bucketIntervals().get()
    )
);

final String specVersion = indexerSchema.getTuningConfig().getVersion();
if (indexerSchema.getTuningConfig().isUseExplicitVersion()) {
  if (specVersion.compareTo(version) < 0) {
    version = specVersion;
  }
}
/**
 * Make the intermediate path for this job run.
 *
 * @return the intermediate path for this job run.
 */
public Path makeIntermediatePath()
{
  return new Path(
      StringUtils.format(
          "%s/%s/%s_%s",
          getWorkingPath(),
          schema.getDataSchema().getDataSource(),
          StringUtils.removeChar(schema.getTuningConfig().getVersion(), ':'),
          schema.getUniqueId()
      )
  );
}
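A worked example of the `%s/%s/%s_%s` format with hypothetical values; `StringUtils.removeChar` strips the colons because `:` is not a legal character in Hadoop path segments.

// Hypothetical inputs: workingPath "/tmp/druid-indexing", dataSource "wikipedia",
// version "2014-10-31T00:00:00.000Z", uniqueId "1".
Path intermediate = config.makeIntermediatePath();
// => /tmp/druid-indexing/wikipedia/2014-10-31T000000.000Z_1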
public boolean isUpdaterJobSpecSet()
{
  return (schema.getIOConfig().getMetadataUpdateSpec() != null);
}
@Override
protected void setup(Context context) throws IOException, InterruptedException
{
  super.setup(context);
  aggregators = config.getSchema().getDataSchema().getAggregators();

  if (DatasourcePathSpec.checkIfReindexingAndIsUseAggEnabled(config.getSchema().getIOConfig().getPathSpec())) {
    aggsForSerializingSegmentInputRow = aggregators;
  } else {
    // Note: this is required for "delta-ingestion" use case where we are reading rows stored in Druid as well
    // as late arriving data on HDFS etc.
    aggsForSerializingSegmentInputRow = new AggregatorFactory[aggregators.length];
    for (int i = 0; i < aggregators.length; ++i) {
      aggsForSerializingSegmentInputRow[i] = aggregators[i].getCombiningFactory();
    }
  }
  typeHelperMap = InputRowSerde.getTypeHelperMap(
      config.getSchema()
            .getDataSchema()
            .getParser()
            .getParseSpec()
            .getDimensionsSpec()
  );
}
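To make the combining-factory branch concrete: a combining factory aggregates over the already-computed metric column (referenced by its output name) rather than the raw input column, which is what re-reading rows already stored in Druid requires. A sketch using `LongSumAggregatorFactory`, chosen here for illustration only:

// At first ingest: sums the raw input column "bytes" into the metric "totalBytes".
AggregatorFactory atIngest = new LongSumAggregatorFactory("totalBytes", "bytes");
// For rows already stored in Druid: sums the existing "totalBytes" column onto itself.
AggregatorFactory combining = atIngest.getCombiningFactory();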
private static IncrementalIndex makeIncrementalIndex(
    Bucket theBucket,
    AggregatorFactory[] aggs,
    HadoopDruidIndexerConfig config,
    Iterable<String> oldDimOrder,
    Map<String, ColumnCapabilitiesImpl> oldCapabilities
)
{
  final HadoopTuningConfig tuningConfig = config.getSchema().getTuningConfig();

  final IncrementalIndexSchema indexSchema = new IncrementalIndexSchema.Builder()
      .withMinTimestamp(theBucket.time.getMillis())
      .withTimestampSpec(config.getSchema().getDataSchema().getParser().getParseSpec().getTimestampSpec())
      .withDimensionsSpec(config.getSchema().getDataSchema().getParser())
      .withQueryGranularity(config.getSchema().getDataSchema().getGranularitySpec().getQueryGranularity())
      .withMetrics(aggs)
      .withRollup(config.getSchema().getDataSchema().getGranularitySpec().isRollup())
      .build();

  IncrementalIndex newIndex = new IncrementalIndex.Builder()
      .setIndexSchema(indexSchema)
      .setReportParseExceptions(!tuningConfig.isIgnoreInvalidRows()) // only used by OffHeapIncrementalIndex
      .setMaxRowCount(tuningConfig.getRowFlushBoundary())
      .setMaxBytesInMemory(TuningConfigs.getMaxBytesInMemoryOrDefault(tuningConfig.getMaxBytesInMemory()))
      .buildOnheap();

  if (oldDimOrder != null && !indexSchema.getDimensionsSpec().hasCustomDimensions()) {
    newIndex.loadDimensionIterable(oldDimOrder, oldCapabilities);
  }

  return newIndex;
}
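A hedged sketch of the call, assuming a reducer that carries the dimension order and column capabilities forward from a previously persisted index; both may be null for the first index of a bucket.

// Sketch only: first index for the bucket, so no prior dimension order to preserve.
IncrementalIndex index = makeIncrementalIndex(bucket, aggregators, config, null, null);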
new HadoopIngestionSpec(
    new DataSchema(
        "foo",
        null,
        new AggregatorFactory[0],
        new UniformGranularitySpec(

Assert.assertEquals(task.getDataSource(), task2.getDataSource());
Assert.assertEquals(
    task.getSpec().getTuningConfig().getJobProperties(),
    task2.getSpec().getTuningConfig().getJobProperties()
);
Assert.assertEquals("blah", task.getClasspathPrefix());
final HadoopDruidIndexerConfig config = HadoopDruidIndexerConfig.fromSpec(
    theSchema
        .withTuningConfig(theSchema.getTuningConfig().withVersion(version))
);
public InputRowParser getParser()
{
  return schema.getDataSchema().getParser();
}
public String getWorkingPath()
{
  final String workingPath = schema.getTuningConfig().getWorkingPath();
  return workingPath == null ? DEFAULT_WORKING_PATH : workingPath;
}
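A note on the fallback, with the default value treated as an assumption here: Druid documents `/tmp/druid-indexing` as the default working path, which is what `DEFAULT_WORKING_PATH` is expected to hold.

// Assumed default; confirm against DEFAULT_WORKING_PATH in this class.
String workingPath = config.getWorkingPath();  // "/tmp/druid-indexing" when unset in the spec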
public String runTask(String[] args) throws Exception
{
  final String schema = args[0];
  final String workingPath = args[1];
  final String segmentOutputPath = args[2];

  final HadoopIngestionSpec theSchema = HadoopDruidIndexerConfig.JSON_MAPPER
      .readValue(
          schema,
          HadoopIngestionSpec.class
      );
  final HadoopDruidIndexerConfig config = HadoopDruidIndexerConfig.fromSpec(
      theSchema
          .withIOConfig(theSchema.getIOConfig().withSegmentOutputPath(segmentOutputPath))
          .withTuningConfig(theSchema.getTuningConfig().withWorkingPath(workingPath))
  );

  job = new HadoopDruidDetermineConfigurationJob(config);

  log.info("Starting a hadoop determine configuration job...");
  if (job.run()) {
    return HadoopDruidIndexerConfig.JSON_MAPPER.writeValueAsString(
        new HadoopDetermineConfigInnerProcessingStatus(config.getSchema(), job.getStats(), null)
    );
  } else {
    return HadoopDruidIndexerConfig.JSON_MAPPER.writeValueAsString(
        new HadoopDetermineConfigInnerProcessingStatus(null, job.getStats(), job.getErrorMessage())
    );
  }
}
int numBackgroundPersistThreads = config.getSchema().getTuningConfig().getNumBackgroundPersistThreads();
if (numBackgroundPersistThreads > 0) {
  final BlockingQueue<Runnable> queue = new SynchronousQueue<>();

final FileSystem outputFS = new Path(config.getSchema().getIOConfig().getSegmentOutputPath())
    .getFileSystem(context.getConfiguration());

config.getDataSource(),
interval,
config.getSchema().getTuningConfig().getVersion(),
null,
ImmutableList.copyOf(allDimensionNames),
mergedBase,
JobHelper.makeFileNamePath(
    new Path(config.getSchema().getIOConfig().getSegmentOutputPath()),
    outputFS,
    segmentTemplate,
),
JobHelper.makeFileNamePath(
    new Path(config.getSchema().getIOConfig().getSegmentOutputPath()),
    outputFS,
    segmentTemplate,
),
JobHelper.makeTmpPath(
    new Path(config.getSchema().getIOConfig().getSegmentOutputPath()),
    outputFS,