public static IMRBatchCubingInputSide getBatchCubingInputSide(CubeSegment seg) {
    IJoinedFlatTableDesc flatDesc = EngineFactory.getJoinedFlatTableDesc(seg);
    return (IMRBatchCubingInputSide) SourceManager.createEngineAdapter(seg, IMRInput.class)
            .getBatchCubingInputSide(flatDesc);
}
public static ISparkBatchCubingInputSide getBatchCubingInputSide(CubeSegment seg) {
    IJoinedFlatTableDesc flatDesc = EngineFactory.getJoinedFlatTableDesc(seg);
    return (ISparkBatchCubingInputSide) SourceManager.createEngineAdapter(seg, ISparkInput.class)
            .getBatchCubingInputSide(flatDesc);
}
// Mapper setup: load cube metadata named in the job configuration and build a
// BaseCuboidBuilder from the segment's flat table layout and dictionaries.
@Override
protected void doSetup(Context context) throws IOException {
    super.bindCurrentConfiguration(context.getConfiguration());
    cubeName = context.getConfiguration().get(BatchConstants.CFG_CUBE_NAME);
    segmentID = context.getConfiguration().get(BatchConstants.CFG_CUBE_SEGMENT_ID);

    final KylinConfig kylinConfig = AbstractHadoopJob.loadKylinPropsAndMetadata();
    cube = CubeManager.getInstance(kylinConfig).getCube(cubeName);
    cubeDesc = cube.getDescriptor();
    cubeSegment = cube.getSegmentById(segmentID);

    CubeJoinedFlatTableEnrich intermediateTableDesc = new CubeJoinedFlatTableEnrich(
            EngineFactory.getJoinedFlatTableDesc(cubeSegment), cubeDesc);
    Map<TblColRef, Dictionary<String>> dictionaryMap = DictionaryGetterUtil.getDictionaryMap(cubeSegment,
            context.getInputSplit(), context.getConfiguration());
    baseCuboidBuilder = new BaseCuboidBuilder(kylinConfig, cubeDesc, cubeSegment, intermediateTableDesc,
            dictionaryMap);
}
// Mapper setup for global dictionary columns: map each global-dict column to its
// index on the flat table and prepare a distinct-value set per column.
@Override
protected void doSetup(Context context) throws IOException {
    Configuration conf = context.getConfiguration();
    bindCurrentConfiguration(conf);
    config = AbstractHadoopJob.loadKylinPropsAndMetadata();

    cubeName = conf.get(BatchConstants.CFG_CUBE_NAME);
    cube = CubeManager.getInstance(config).getCube(cubeName);
    cubeDesc = cube.getDescriptor();
    cubeSeg = cube.getSegmentById(conf.get(BatchConstants.CFG_CUBE_SEGMENT_ID));
    flatTableInputFormat = MRUtil.getBatchCubingInputSide(cubeSeg).getFlatTableInputFormat();
    intermediateTableDesc = new CubeJoinedFlatTableEnrich(EngineFactory.getJoinedFlatTableDesc(cubeSeg), cubeDesc);

    globalColumns = cubeDesc.getAllGlobalDictColumns();
    globalColumnIndex = new int[globalColumns.size()];
    globalColumnValues = Lists.newArrayListWithExpectedSize(globalColumns.size());
    for (int i = 0; i < globalColumns.size(); i++) {
        TblColRef colRef = globalColumns.get(i);
        int columnIndexOnFlatTbl = intermediateTableDesc.getColumnIndex(colRef);
        globalColumnIndex[i] = columnIndexOnFlatTbl;
        globalColumnValues.add(Sets.<String> newHashSet());
    }

    splitKey = DictionaryGetterUtil.getInputSplitSignature(cubeSeg, context.getInputSplit());
}
// Mapper setup for fact distinct columns: record the flat-table index of every
// dimension/dictionary column reported by the reducer mapping.
@Override
protected void doSetup(Context context) throws IOException {
    Configuration conf = context.getConfiguration();
    bindCurrentConfiguration(conf);
    KylinConfig config = AbstractHadoopJob.loadKylinPropsAndMetadata();

    cubeName = conf.get(BatchConstants.CFG_CUBE_NAME);
    cube = CubeManager.getInstance(config).getCube(cubeName);
    cubeSeg = cube.getSegmentById(conf.get(BatchConstants.CFG_CUBE_SEGMENT_ID));
    cubeDesc = cube.getDescriptor();
    baseCuboidId = Cuboid.getBaseCuboidId(cubeDesc);
    reducerMapping = new FactDistinctColumnsReducerMapping(cube);
    allCols = reducerMapping.getAllDimDictCols();
    flatTableInputFormat = MRUtil.getBatchCubingInputSide(cubeSeg).getFlatTableInputFormat();
    intermediateTableDesc = new CubeJoinedFlatTableEnrich(EngineFactory.getJoinedFlatTableDesc(cubeSeg), cubeDesc);

    columnIndex = new int[allCols.size()];
    for (int i = 0; i < allCols.size(); i++) {
        TblColRef colRef = allCols.get(i);
        int columnIndexOnFlatTbl = intermediateTableDesc.getColumnIndex(colRef);
        columnIndex[i] = columnIndexOnFlatTbl;
    }
}
String segmentID = context.getConfiguration().get(BatchConstants.CFG_CUBE_SEGMENT_ID);
cubeSegment = cube.getSegmentById(segmentID);
flatDesc = new CubeJoinedFlatTableEnrich(EngineFactory.getJoinedFlatTableDesc(cubeSegment), cubeDesc);
// Configure the Spark cubing-by-layer step: the flat table's directory becomes the
// job's input path, alongside cube name, segment id, metadata URL and output path.
public void configureSparkJob(final CubeSegment seg, final SparkExecutable sparkExecutable, final String jobId,
        final String cuboidRootPath) {
    final IJoinedFlatTableDesc flatTableDesc = EngineFactory.getJoinedFlatTableDesc(seg);
    final String tablePath = JoinedFlatTable.getTableDir(flatTableDesc, getJobWorkingDir(jobId));
    sparkExecutable.setParam(SparkCubingByLayer.OPTION_CUBE_NAME.getOpt(), seg.getRealization().getName());
    sparkExecutable.setParam(SparkCubingByLayer.OPTION_SEGMENT_ID.getOpt(), seg.getUuid());
    sparkExecutable.setParam(SparkCubingByLayer.OPTION_INPUT_TABLE.getOpt(),
            seg.getConfig().getHiveDatabaseForIntermediateTable() + "." + flatTableDesc.getTableName());
    sparkExecutable.setParam(SparkCubingByLayer.OPTION_INPUT_PATH.getOpt(), tablePath);
    sparkExecutable.setParam(SparkCubingByLayer.OPTION_META_URL.getOpt(),
            getSegmentMetadataUrl(seg.getConfig(), jobId));
    sparkExecutable.setParam(SparkCubingByLayer.OPTION_OUTPUT_PATH.getOpt(), cuboidRootPath);
    sparkExecutable.setJobId(jobId);

    StringBuilder jars = new StringBuilder();
    StringUtil.appendWithSeparator(jars, seg.getConfig().getSparkAdditionalJars());
    sparkExecutable.setJars(jars.toString());
    sparkExecutable.setName(ExecutableConstants.STEP_NAME_BUILD_SPARK_CUBE);
}
// Build the Spark fact-distinct-columns step; the input table and path come from
// the segment's joined flat table descriptor.
public SparkExecutable createFactDistinctColumnsSparkStep(String jobId) {
    final SparkExecutable sparkExecutable = new SparkExecutable();
    final IJoinedFlatTableDesc flatTableDesc = EngineFactory.getJoinedFlatTableDesc(seg);
    final String tablePath = JoinedFlatTable.getTableDir(flatTableDesc, getJobWorkingDir(jobId));

    sparkExecutable.setClassName(SparkFactDistinct.class.getName());
    sparkExecutable.setParam(SparkFactDistinct.OPTION_CUBE_NAME.getOpt(), seg.getRealization().getName());
    sparkExecutable.setParam(SparkFactDistinct.OPTION_META_URL.getOpt(),
            getSegmentMetadataUrl(seg.getConfig(), jobId));
    sparkExecutable.setParam(SparkFactDistinct.OPTION_INPUT_TABLE.getOpt(),
            seg.getConfig().getHiveDatabaseForIntermediateTable() + "." + flatTableDesc.getTableName());
    sparkExecutable.setParam(SparkFactDistinct.OPTION_INPUT_PATH.getOpt(), tablePath);
    sparkExecutable.setParam(SparkFactDistinct.OPTION_OUTPUT_PATH.getOpt(), getFactDistinctColumnsPath(jobId));
    sparkExecutable.setParam(SparkFactDistinct.OPTION_SEGMENT_ID.getOpt(), seg.getUuid());
    sparkExecutable.setParam(SparkFactDistinct.OPTION_STATS_SAMPLING_PERCENT.getOpt(),
            String.valueOf(config.getConfig().getCubingInMemSamplingPercent()));
    sparkExecutable.setJobId(jobId);
    sparkExecutable.setName(ExecutableConstants.STEP_NAME_FACT_DISTINCT_COLUMNS);
    sparkExecutable.setCounterSaveAs(CubingJob.SOURCE_RECORD_COUNT + "," + CubingJob.SOURCE_SIZE_BYTES,
            getCounterOuputPath(jobId));

    StringBuilder jars = new StringBuilder();
    StringUtil.appendWithSeparator(jars, seg.getConfig().getSparkAdditionalJars());
    sparkExecutable.setJars(jars.toString());
    return sparkExecutable;
}
@Before
public void setUp() throws Exception {
    this.createTestMetadata();
    cube = CubeManager.getInstance(getTestConfig()).getCube("test_kylin_cube_with_slr_ready");
    cubeSegment = cube.getSegments().get(0);
    flatTableDesc = EngineFactory.getJoinedFlatTableDesc(cubeSegment);
}
// Spark map function: lazily initialize a shared BaseCuboidBuilder on first call
// (double-checked locking), then encode each flat-table row into its base cuboid
// row key and measure values.
@Override
public Tuple2<ByteArray, Object[]> call(String[] rowArray) throws Exception {
    if (initialized == false) {
        synchronized (SparkCubingByLayer.class) {
            if (initialized == false) {
                KylinConfig kConfig = AbstractHadoopJob.loadKylinConfigFromHdfs(conf, metaUrl);
                try (KylinConfig.SetAndUnsetThreadLocalConfig autoUnset = KylinConfig
                        .setAndUnsetThreadLocalConfig(kConfig)) {
                    CubeInstance cubeInstance = CubeManager.getInstance(kConfig).getCube(cubeName);
                    CubeDesc cubeDesc = cubeInstance.getDescriptor();
                    CubeSegment cubeSegment = cubeInstance.getSegmentById(segmentId);
                    CubeJoinedFlatTableEnrich interDesc = new CubeJoinedFlatTableEnrich(
                            EngineFactory.getJoinedFlatTableDesc(cubeSegment), cubeDesc);
                    long baseCuboidId = Cuboid.getBaseCuboidId(cubeDesc);
                    Cuboid baseCuboid = Cuboid.findForMandatory(cubeDesc, baseCuboidId);
                    baseCuboidBuilder = new BaseCuboidBuilder(kConfig, cubeDesc, cubeSegment, interDesc,
                            AbstractRowKeyEncoder.createInstance(cubeSegment, baseCuboid),
                            MeasureIngester.create(cubeDesc.getMeasures()), cubeSegment.buildDictionaryMap());
                    initialized = true;
                }
            }
        }
    }
    baseCuboidBuilder.resetAggrs();
    byte[] rowKey = baseCuboidBuilder.buildKey(rowArray);
    Object[] result = baseCuboidBuilder.buildValueObjects(rowArray);
    return new Tuple2<>(new ByteArray(rowKey), result);
}
// Spark fact-distinct initialization: build the key/value builder, reducer mapping,
// and per-cuboid HLL counters from the segment's enriched flat table descriptor.
private void init() {
    KylinConfig kConfig = AbstractHadoopJob.loadKylinConfigFromHdfs(conf, metaUrl);
    try (KylinConfig.SetAndUnsetThreadLocalConfig autoUnset = KylinConfig.setAndUnsetThreadLocalConfig(kConfig)) {
        CubeInstance cubeInstance = CubeManager.getInstance(kConfig).getCube(cubeName);
        CubeDesc cubeDesc = cubeInstance.getDescriptor();
        CubeSegment cubeSegment = cubeInstance.getSegmentById(segmentId);
        CubeJoinedFlatTableEnrich intermediateTableDesc = new CubeJoinedFlatTableEnrich(
                EngineFactory.getJoinedFlatTableDesc(cubeSegment), cubeDesc);
        keyValueBuilder = new KeyValueBuilder(intermediateTableDesc);
        reducerMapping = new FactDistinctColumnsReducerMapping(cubeInstance);
        tmpbuf = ByteBuffer.allocate(4096);

        int[] rokeyColumnIndexes = intermediateTableDesc.getRowKeyColumnIndexes();
        Long[] cuboidIds = getCuboidIds(cubeSegment);
        Integer[][] cuboidsBitSet = CuboidUtil.getCuboidBitSet(cuboidIds, rokeyColumnIndexes.length);
        boolean isNewAlgorithm = isUsePutRowKeyToHllNewAlgorithm(cubeDesc);
        HLLCounter[] cuboidsHLL = getInitCuboidsHLL(cuboidIds.length,
                cubeDesc.getConfig().getCubeStatsHLLPrecision());
        cuboidStatCalculator = new CuboidStatCalculator(rokeyColumnIndexes, cuboidIds, cuboidsBitSet,
                isNewAlgorithm, cuboidsHLL);
        allCols = reducerMapping.getAllDimDictCols();
        initDictColDeduper(cubeDesc);
        initColumnIndex(intermediateTableDesc);
        initialized = true;
    }
}
// Streaming sampling: estimate per-cuboid cardinality over a streaming batch with
// HyperLogLog counters, using the flat table definition derived from the cube descriptor.
@Override
public Map<Long, HyperLogLogPlusCounter> sampling(StreamingBatch streamingBatch) {
    final CubeManager cubeManager = CubeManager.getInstance(KylinConfig.getInstanceFromEnv());
    final CubeInstance cubeInstance = cubeManager.reloadCubeLocal(cubeName);
    final IJoinedFlatTableDesc flatDesc = EngineFactory.getJoinedFlatTableDesc(cubeInstance.getDescriptor());
    long start = System.currentTimeMillis();
    final Map<Long, HyperLogLogPlusCounter> samplingResult = CubingUtils.sampling(cubeInstance.getDescriptor(),
            flatDesc, Lists.transform(streamingBatch.getMessages(), new Function<StreamingMessage, List<String>>() {
                @Nullable
                @Override
                public List<String> apply(@Nullable StreamingMessage input) {
                    return input.getData();
                }
            }));
    logger.info(String.format("sampling of %d messages cost %d ms", streamingBatch.getMessages().size(),
            (System.currentTimeMillis() - start)));
    return samplingResult;
}
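The snippets above share one pattern: ask EngineFactory for the segment's joined flat table descriptor, then, wherever flat-table column positions are needed, wrap it in CubeJoinedFlatTableEnrich. The following is a minimal sketch of that pattern only, not code from any of the classes above; the method name describeFlatTable and its parameters are hypothetical, and error handling is omitted.

// Hypothetical helper illustrating the common pattern: resolve a cube segment,
// obtain its joined flat table descriptor, and enrich it with the cube descriptor
// so callers can look up flat-table column indexes (as the mapper setups above do).
public static CubeJoinedFlatTableEnrich describeFlatTable(String cubeName, String segmentId) {
    KylinConfig kylinConfig = KylinConfig.getInstanceFromEnv();
    CubeInstance cube = CubeManager.getInstance(kylinConfig).getCube(cubeName);
    CubeSegment segment = cube.getSegmentById(segmentId);
    CubeDesc cubeDesc = cube.getDescriptor();

    // Plain descriptor: flat table name, column list, segment range.
    IJoinedFlatTableDesc flatDesc = EngineFactory.getJoinedFlatTableDesc(segment);

    // Enriched descriptor adds cube-aware lookups such as getColumnIndex(TblColRef)
    // and getRowKeyColumnIndexes(), used above to map dictionary and row-key columns
    // onto flat table positions.
    return new CubeJoinedFlatTableEnrich(flatDesc, cubeDesc);
}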