void init(EsInputSplit esSplit, Configuration cfg, Progressable progressable) {
    // get a copy to override the host/port
    Settings settings = HadoopSettingsManager.loadFrom(cfg).copy().load(esSplit.getPartition().getSerializedSettings());

    if (log.isTraceEnabled()) {
        log.trace(String.format("Init shard reader from cfg %s", HadoopCfgUtils.asProperties(cfg)));
        log.trace(String.format("Init shard reader w/ settings %s", settings));
    }

    this.esSplit = esSplit;

    // initialize mapping/scroll reader
    InitializationUtils.setValueReaderIfNotSet(settings, WritableValueReader.class, log);

    PartitionDefinition part = esSplit.getPartition();
    PartitionReader partitionReader = RestService.createReader(settings, part, log);

    this.scrollReader = partitionReader.scrollReader;
    this.client = partitionReader.client;
    this.queryBuilder = partitionReader.queryBuilder;
    this.progressable = progressable;

    // in Hadoop-like envs (Spark) the progressable might be null and thus the heart-beat is not needed
    if (progressable != null) {
        beat = new HeartBeat(progressable, cfg, settings.getHeartBeatLead(), log);
    }

    if (log.isDebugEnabled()) {
        log.debug(String.format("Initializing RecordReader for [%s]", esSplit));
    }
}
@Override
public void sinkConfInit(FlowProcess<JobConf> flowProcess, Tap<JobConf, RecordReader, OutputCollector> tap, JobConf conf) {
    conf.setOutputFormat(EsOutputFormat.class);
    // define an output dir to prevent Cascading from setting up a TempHfs and overriding the OutputFormat
    Settings set = loadSettings(conf, false);

    Log log = LogFactory.getLog(EsTap.class);
    InitializationUtils.setValueWriterIfNotSet(set, CascadingValueWriter.class, log);
    InitializationUtils.setValueReaderIfNotSet(set, JdkValueReader.class, log);
    InitializationUtils.setBytesConverterIfNeeded(set, CascadingLocalBytesConverter.class, log);
    InitializationUtils.setFieldExtractorIfNotSet(set, CascadingFieldExtractor.class, log);

    // NB: this property needs to be set even though it is not used - and since a URI causes problems, use only the resource/file
    //conf.set("mapred.output.dir", set.getTargetUri() + "/" + set.getTargetResource());
    HadoopCfgUtils.setFileOutputFormatDir(conf, set.getResourceWrite());
    HadoopCfgUtils.setOutputCommitterClass(conf, EsOutputFormat.EsOldAPIOutputCommitter.class.getName());

    if (log.isTraceEnabled()) {
        log.trace("Initialized (sink) configuration " + HadoopCfgUtils.asProperties(conf));
    }
}
@Override
public FileSplit[] getSplits(JobConf job, int numSplits) throws IOException {
    // first, merge input table properties (since there's no access to them ...)
    Settings settings = HadoopSettingsManager.loadFrom(job);
    //settings.merge(IOUtils.propsFromString(settings.getProperty(HiveConstants.INPUT_TBL_PROPERTIES)));

    Log log = LogFactory.getLog(getClass());
    // move on to initialization
    InitializationUtils.setValueReaderIfNotSet(settings, HiveValueReader.class, log);
    if (settings.getOutputAsJson() == false) {
        // only set the fields if we aren't asking for raw JSON
        settings.setProperty(InternalConfigurationOptions.INTERNAL_ES_TARGET_FIELDS, StringUtils.concatenate(HiveUtils.columnToAlias(settings), ","));
    }
    HiveUtils.init(settings, log);

    // decorate original splits as FileSplit
    InputSplit[] shardSplits = super.getSplits(job, numSplits);
    FileSplit[] wrappers = new FileSplit[shardSplits.length];
    Path path = new Path(job.get(HiveConstants.TABLE_LOCATION));
    for (int i = 0; i < wrappers.length; i++) {
        wrappers[i] = new EsHiveSplit(shardSplits[i], path);
    }
    return wrappers;
}
@Override
public void open(Map conf, TopologyContext context, SpoutOutputCollector collector) {
    this.collector = collector;

    LinkedHashMap copy = new LinkedHashMap(conf);
    copy.putAll(spoutConfig);

    StormSettings settings = new StormSettings(copy);

    InitializationUtils.setValueReaderIfNotSet(settings, JdkValueReader.class, log);

    ackReads = settings.getStormSpoutReliable();

    if (ackReads) {
        inTransitQueue = new LinkedHashMap<Object, Object>();
        replayQueue = new LinkedList<Object[]>();
        retries = new HashMap<Object, Integer>();
        queueSize = settings.getStormSpoutReliableQueueSize();
        tupleRetries = settings.getStormSpoutReliableRetriesPerTuple();
        tupleFailure = settings.getStormSpoutReliableTupleFailureHandling();
    }

    int totalTasks = context.getComponentTasks(context.getThisComponentId()).size();
    int currentTask = context.getThisTaskIndex();

    // match the partitions based on the current topology
    List<PartitionDefinition> partitions = RestService.findPartitions(settings, log);
    List<PartitionDefinition> assigned = RestService.assignPartitions(partitions, currentTask, totalTasks);
    iterator = RestService.multiReader(settings, assigned, log);
}
private void init(String location, Job job, boolean read) {
    Settings settings = HadoopSettingsManager.loadFrom(job.getConfiguration()).merge(properties);
    settings = (read ? settings.setResourceRead(location) : settings.setResourceWrite(location));

    InitializationUtils.checkIdForOperation(settings);
    InitializationUtils.setValueWriterIfNotSet(settings, PigValueWriter.class, log);
    InitializationUtils.setValueReaderIfNotSet(settings, PigValueReader.class, log);
    InitializationUtils.setBytesConverterIfNeeded(settings, PigBytesConverter.class, log);
    InitializationUtils.setFieldExtractorIfNotSet(settings, PigFieldExtractor.class, log);

    isJSON = settings.getOutputAsJson();
}
static Settings addDefaultsToSettings(Properties flowProperties, Properties tapProperties, Log log) {
    Settings settings = HadoopSettingsManager.loadFrom(CascadingUtils.extractOriginalProperties(flowProperties)).merge(tapProperties);

    InitializationUtils.validateSettings(settings);
    InitializationUtils.setValueWriterIfNotSet(settings, CascadingValueWriter.class, log);
    InitializationUtils.setValueReaderIfNotSet(settings, JdkValueReader.class, log);
    InitializationUtils.setBytesConverterIfNeeded(settings, CascadingLocalBytesConverter.class, log);
    InitializationUtils.setFieldExtractorIfNotSet(settings, CascadingFieldExtractor.class, log);

    return settings;
}
void init(ShardInputSplit esSplit, Configuration cfg, Progressable progressable) {
    // get a copy to override the host/port
    Settings settings = HadoopSettingsManager.loadFrom(cfg).copy().load(esSplit.settings);

    if (log.isTraceEnabled()) {
        log.trace(String.format("Init shard reader from cfg %s", HadoopCfgUtils.asProperties(cfg)));
        log.trace(String.format("Init shard reader w/ settings %s", esSplit.settings));
    }

    this.esSplit = esSplit;

    // initialize mapping/scroll reader
    InitializationUtils.setValueReaderIfNotSet(settings, WritableValueReader.class, log);

    PartitionDefinition part = new PartitionDefinition(esSplit.nodeIp, esSplit.httpPort, esSplit.nodeName, esSplit.nodeId, esSplit.shardId, esSplit.onlyNode, settings.save(), esSplit.mapping);
    PartitionReader partitionReader = RestService.createReader(settings, part, log);

    this.scrollReader = partitionReader.scrollReader;
    this.client = partitionReader.client;
    this.queryBuilder = partitionReader.queryBuilder;
    this.progressable = progressable;

    // in Hadoop-like envs (Spark) the progressable might be null and thus the heart-beat is not needed
    if (progressable != null) {
        beat = new HeartBeat(progressable, cfg, settings.getHeartBeatLead(), log);
    }

    if (log.isDebugEnabled()) {
        log.debug(String.format("Initializing RecordReader for [%s]", esSplit));
    }
}