public void setExternalDataPaths(List<String> externalDataPaths) {
    setParam("externalDataPaths", StringUtil.join(externalDataPaths, ","));
}

public void setIntermediateTables(List<String> tableIdentity) {
    setParam("oldHiveTables", StringUtil.join(tableIdentity, ","));
}
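// The two setters above flatten a List<String> into a single comma-joined
// executable parameter. A minimal round-trip sketch (illustrative only, not
// Kylin's actual step internals) shows how the consuming side recovers the
// list, and why individual table names and paths must not contain commas:
public class ParamRoundTripDemo {
    public static void main(String[] args) {
        String joined = String.join(",", "default.kylin_intermediate_a", "default.kylin_intermediate_b");
        for (String table : joined.split(",")) {
            System.out.println(table); // prints each table identity in turn
        }
    }
}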
@Override
public void doMap(LongWritable key, BytesWritable value, Context context) throws IOException, InterruptedException {
    ByteBuffer buffer = ByteBuffer.wrap(value.getBytes(), 0, value.getLength());
    StreamingMessageRow row = streamingParser.parse(buffer).get(0);
    if (row == null) {
        throw new IllegalArgumentException("Streaming parser returned a null row");
    }
    // join the parsed columns with the configured delimiter and output this row as the value
    data = StringUtil.join(row.getData(), delimiter);
    outValue.set(Bytes.toBytes(data));
    context.write(outKey, outValue);
}
/**
 * Read the given path as a Java pair RDD of sequence files. The path may contain
 * second-level sub-folders, which are expanded into a comma-separated input list.
 *
 * @param inputPath  the root HDFS path to read
 * @param fs         the file system used to list the root path
 * @param sc         the Spark context that creates the RDD
 * @param keyClass   the sequence file key class
 * @param valueClass the sequence file value class
 * @return a pair RDD over all matching sequence files
 * @throws IOException if listing the input path fails
 */
public static JavaPairRDD parseInputPath(String inputPath, FileSystem fs, JavaSparkContext sc, Class keyClass,
        Class valueClass) throws IOException {
    List<String> inputFolders = Lists.newArrayList();
    Path inputHDFSPath = new Path(inputPath);
    FileStatus[] fileStatuses = fs.listStatus(inputHDFSPath);
    boolean hasDir = false;
    for (FileStatus stat : fileStatuses) {
        // collect sub-folders, skipping hidden/meta entries such as "_SUCCESS"
        if (stat.isDirectory() && !stat.getPath().getName().startsWith("_")) {
            hasDir = true;
            inputFolders.add(stat.getPath().toString());
        }
    }
    if (!hasDir) {
        // flat layout: read the root path directly
        return sc.sequenceFile(inputHDFSPath.toString(), keyClass, valueClass);
    }
    // nested layout: read all sub-folders as one comma-separated input
    return sc.sequenceFile(StringUtil.join(inputFolders, ","), keyClass, valueClass);
}
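// Hypothetical usage of parseInputPath: reading a cuboid root whose sub-folders
// hold sequence files. The path, app name, and Text key/value classes are
// illustrative assumptions, not values from the surrounding code.
Configuration conf = new Configuration();
FileSystem fs = FileSystem.get(conf);
JavaSparkContext sc = new JavaSparkContext(new SparkConf().setAppName("parse-input-demo"));
JavaPairRDD rdd = parseInputPath("/kylin/example_cube/cuboid", fs, sc, Text.class, Text.class);
System.out.println("partitions: " + rdd.getNumPartitions());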
@Override
public void addStepPhase4_Cleanup(DefaultChainedExecutable jobFlow) {
    final String jobWorkingDir = getJobWorkingDir(jobFlow, hdfsWorkingDir);

    org.apache.kylin.source.hive.GarbageCollectionStep step = new org.apache.kylin.source.hive.GarbageCollectionStep();
    step.setName(ExecutableConstants.STEP_NAME_HIVE_CLEANUP);
    step.setIntermediateTables(Collections.singletonList(getIntermediateTableIdentity()));
    step.setExternalDataPaths(Collections.singletonList(JoinedFlatTable.getTableDir(flatDesc, jobWorkingDir)));
    step.setHiveViewIntermediateTableIdentities(StringUtil.join(hiveViewIntermediateTables, ","));
    jobFlow.addTask(step);
}
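// A minimal sketch of what a cleanup step wired this way typically does with
// its parameters. This is a hypothetical body, not the real
// org.apache.kylin.source.hive.GarbageCollectionStep, and getParam is assumed
// to be the accessor matching setParam above.
StringBuilder dropHql = new StringBuilder();
for (String table : getParam("oldHiveTables").split(",")) {
    dropHql.append("DROP TABLE IF EXISTS ").append(table).append(";\n");
}
FileSystem fs = FileSystem.get(new Configuration());
for (String path : getParam("externalDataPaths").split(",")) {
    fs.delete(new Path(path), true); // recursively remove the flat-table directory
}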
public SparkExecutable createMergeCuboidDataStep(CubeSegment seg, List<CubeSegment> mergingSegments, String jobID) {
    final List<String> mergingCuboidPaths = Lists.newArrayList();
    for (CubeSegment merging : mergingSegments) {
        mergingCuboidPaths.add(getCuboidRootPath(merging));
    }
    String formattedPath = StringUtil.join(mergingCuboidPaths, ",");
    String outputPath = getCuboidRootPath(jobID);

    final SparkExecutable sparkExecutable = new SparkExecutable();
    sparkExecutable.setClassName(SparkCubingMerge.class.getName());
    sparkExecutable.setParam(SparkCubingMerge.OPTION_CUBE_NAME.getOpt(), seg.getRealization().getName());
    sparkExecutable.setParam(SparkCubingMerge.OPTION_SEGMENT_ID.getOpt(), seg.getUuid());
    sparkExecutable.setParam(SparkCubingMerge.OPTION_INPUT_PATH.getOpt(), formattedPath);
    sparkExecutable.setParam(SparkCubingMerge.OPTION_META_URL.getOpt(), getSegmentMetadataUrl(seg.getConfig(), jobID));
    sparkExecutable.setParam(SparkCubingMerge.OPTION_OUTPUT_PATH.getOpt(), outputPath);
    sparkExecutable.setJobId(jobID);
    sparkExecutable.setName(ExecutableConstants.STEP_NAME_MERGE_CUBOID);

    StringBuilder jars = new StringBuilder();
    StringUtil.appendWithSeparator(jars, seg.getConfig().getSparkAdditionalJars());
    sparkExecutable.setJars(jars.toString());
    return sparkExecutable;
}
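// Illustration of the jar-list pattern above, assuming Kylin's
// StringUtil.appendWithSeparator comma-separates entries once the builder is
// non-empty (the jar paths here are hypothetical):
StringBuilder jars = new StringBuilder();
StringUtil.appendWithSeparator(jars, "/opt/ext/htrace-core.jar");
StringUtil.appendWithSeparator(jars, "/opt/ext/metrics-core.jar");
// jars.toString() -> "/opt/ext/htrace-core.jar,/opt/ext/metrics-core.jar"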
public SparkExecutable createMergeDictionaryStep(CubeSegment seg, String jobID, List<String> mergingSegmentIds) {
    final SparkExecutable sparkExecutable = new SparkExecutable();
    sparkExecutable.setClassName(SparkMergingDictionary.class.getName());
    sparkExecutable.setParam(SparkMergingDictionary.OPTION_CUBE_NAME.getOpt(), seg.getRealization().getName());
    sparkExecutable.setParam(SparkMergingDictionary.OPTION_SEGMENT_ID.getOpt(), seg.getUuid());
    sparkExecutable.setParam(SparkMergingDictionary.OPTION_META_URL.getOpt(), getSegmentMetadataUrl(seg.getConfig(), jobID));
    sparkExecutable.setParam(SparkMergingDictionary.OPTION_MERGE_SEGMENT_IDS.getOpt(), StringUtil.join(mergingSegmentIds, ","));
    sparkExecutable.setParam(SparkMergingDictionary.OPTION_OUTPUT_PATH_DICT.getOpt(), getDictInfoPath(jobID));
    sparkExecutable.setParam(SparkMergingDictionary.OPTION_OUTPUT_PATH_STAT.getOpt(), getStatisticsPath(jobID));
    sparkExecutable.setJobId(jobID);
    sparkExecutable.setName(ExecutableConstants.STEP_NAME_MERGE_DICTIONARY);
    sparkExecutable.setSparkConfigName(ExecutableConstants.SPARK_SPECIFIC_CONFIG_NAME_MERGE_DICTIONARY);

    StringBuilder jars = new StringBuilder();
    StringUtil.appendWithSeparator(jars, seg.getConfig().getSparkAdditionalJars());
    sparkExecutable.setJars(jars.toString());
    return sparkExecutable;
}
public MapReduceExecutable createMergeDictionaryStep(CubeSegment seg, String jobID, List<String> mergingSegmentIds) {
    MapReduceExecutable mergeDictionaryStep = new MapReduceExecutable();
    mergeDictionaryStep.setName(ExecutableConstants.STEP_NAME_MERGE_DICTIONARY);
    StringBuilder cmd = new StringBuilder();
    appendMapReduceParameters(cmd, JobEngineConfig.IN_MEM_JOB_CONF_SUFFIX);

    appendExecCmdParameters(cmd, BatchConstants.ARG_CUBE_NAME, seg.getCubeInstance().getName());
    appendExecCmdParameters(cmd, BatchConstants.ARG_SEGMENT_ID, seg.getUuid());
    appendExecCmdParameters(cmd, BatchConstants.ARG_META_URL, getSegmentMetadataUrl(seg.getConfig(), jobID));
    appendExecCmdParameters(cmd, MergeDictionaryJob.OPTION_MERGE_SEGMENT_IDS.getOpt(), StringUtil.join(mergingSegmentIds, ","));
    appendExecCmdParameters(cmd, MergeDictionaryJob.OPTION_OUTPUT_PATH_DICT.getOpt(), getDictInfoPath(jobID));
    appendExecCmdParameters(cmd, MergeDictionaryJob.OPTION_OUTPUT_PATH_STAT.getOpt(), getStatisticsPath(jobID));
    appendExecCmdParameters(cmd, BatchConstants.ARG_JOB_NAME, "Kylin_Merge_Dictionary_" + seg.getCubeInstance().getName() + "_Step");

    mergeDictionaryStep.setMapReduceParams(cmd.toString());
    mergeDictionaryStep.setMapReduceJobClass(MergeDictionaryJob.class);
    return mergeDictionaryStep;
}
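// A sketch of the command-assembly convention used above, assuming
// appendExecCmdParameters appends " -<name> <value>" (parameter names and
// values here are hypothetical; this mirrors the pattern, not Kylin's exact code):
StringBuilder cmd = new StringBuilder();
cmd.append(" -").append("cubename").append(" ").append("sales_cube");
cmd.append(" -").append("jobname").append(" ").append("Kylin_Merge_Dictionary_sales_cube_Step");
// cmd.toString() -> " -cubename sales_cube -jobname Kylin_Merge_Dictionary_sales_cube_Step"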
public MapReduceExecutable createMergeCuboidDataStep(CubeSegment seg, List<CubeSegment> mergingSegments, String jobID,
        Class<? extends AbstractHadoopJob> clazz) {
    final List<String> mergingCuboidPaths = Lists.newArrayList();
    for (CubeSegment merging : mergingSegments) {
        mergingCuboidPaths.add(getCuboidRootPath(merging) + "*");
    }
    String formattedPath = StringUtil.join(mergingCuboidPaths, ",");
    String outputPath = getCuboidRootPath(jobID);

    MapReduceExecutable mergeCuboidDataStep = new MapReduceExecutable();
    mergeCuboidDataStep.setName(ExecutableConstants.STEP_NAME_MERGE_CUBOID);
    StringBuilder cmd = new StringBuilder();
    appendMapReduceParameters(cmd);

    appendExecCmdParameters(cmd, BatchConstants.ARG_CUBE_NAME, seg.getCubeInstance().getName());
    appendExecCmdParameters(cmd, BatchConstants.ARG_SEGMENT_ID, seg.getUuid());
    appendExecCmdParameters(cmd, BatchConstants.ARG_INPUT, formattedPath);
    appendExecCmdParameters(cmd, BatchConstants.ARG_OUTPUT, outputPath);
    appendExecCmdParameters(cmd, BatchConstants.ARG_JOB_NAME, "Kylin_Merge_Cuboid_" + seg.getCubeInstance().getName() + "_Step");

    mergeCuboidDataStep.setMapReduceParams(cmd.toString());
    mergeCuboidDataStep.setMapReduceJobClass(clazz);
    return mergeCuboidDataStep;
}
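// Worked example (hypothetical paths) of the input string built above. Unlike
// the Spark merge step, each segment's cuboid root gets a trailing "*" so the
// MapReduce input format matches the level sub-directories beneath it:
List<String> paths = Lists.newArrayList(
        "/kylin/cube_x/seg_2019/cuboid/*",
        "/kylin/cube_x/seg_2020/cuboid/*");
String formattedPath = StringUtil.join(paths, ",");
// -> "/kylin/cube_x/seg_2019/cuboid/*,/kylin/cube_x/seg_2020/cuboid/*"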