@Override public void init(DictionaryInfo dictInfo, int baseId, String hdfsDir) throws IOException { sourceColumn = dictInfo.getSourceTable() + "." + dictInfo.getSourceColumn(); KylinConfig config = KylinConfig.getInstanceFromEnv(); int maxEntriesPerSlice = config.getAppendDictEntrySize(); if (hdfsDir == null) { //build in Kylin job server hdfsDir = KylinConfig.getInstanceFromEnv().getHdfsWorkingDirectory(); } //use UUID to make each segment dict in different HDFS dir and support concurrent build //use timestamp to make the segment dict easily to delete String baseDir = hdfsDir + "resources/SegmentDict" + dictInfo.getResourceDir() + "/" + RandomUtil.randomUUID().toString() + "_" + System.currentTimeMillis() + "/"; this.builder = new AppendTrieDictionaryBuilder(baseDir, maxEntriesPerSlice, false); this.baseId = baseId; }
@Override public void init(DictionaryInfo dictInfo, int baseId, String hdfsDir) throws IOException { sourceColumn = dictInfo.getSourceTable() + "_" + dictInfo.getSourceColumn(); lock = KylinConfig.getInstanceFromEnv().getDistributedLockFactory().lockForCurrentThread(); lock.lock(getLockPath(sourceColumn), Long.MAX_VALUE); int maxEntriesPerSlice = KylinConfig.getInstanceFromEnv().getAppendDictEntrySize(); if (hdfsDir == null) { //build in Kylin job server hdfsDir = KylinConfig.getInstanceFromEnv().getHdfsWorkingDirectory(); } String baseDir = hdfsDir + "resources/GlobalDict" + dictInfo.getResourceDir() + "/"; try { this.builder = new AppendTrieDictionaryBuilder(baseDir, maxEntriesPerSlice, true); } catch (Throwable e) { lock.unlock(getLockPath(sourceColumn)); throw new RuntimeException( String.format(Locale.ROOT, "Failed to create global dictionary on %s ", sourceColumn), e); } this.baseId = baseId; }
public static Dictionary<?> buildDictionaryFromValueList(DictionaryInfo info, List<byte[]> values) { info.setCardinality(values.size()); Dictionary dict = null; int baseId = 0; // always 0 for now int nSamples = 5; ArrayList samples = new ArrayList(); // build dict, case by data type DataType dataType = DataType.getInstance(info.getDataType()); if (dataType.isDateTimeFamily()) dict = buildDateStrDict(values, baseId, nSamples, samples); else if (dataType.isNumberFamily()) dict = buildNumberDict(values, baseId, nSamples, samples); else dict = buildStringDict(values, baseId, nSamples, samples); // log a few samples StringBuilder buf = new StringBuilder(); for (Object s : samples) { if (buf.length() > 0) buf.append(", "); buf.append(s.toString()).append("=>").append(dict.getIdFromValue(s)); } logger.info("Dictionary value samples: " + buf.toString()); logger.info("Dictionary cardinality " + info.getCardinality()); if (values.size() > DICT_MAX_CARDINALITY) throw new IllegalArgumentException("Too high cardinality is not suitable for dictionary -- " + info.getSourceTable() + "." + info.getSourceColumn() + " cardinality: " + values.size()); return dict; }
public static org.apache.kylin.common.util.Dictionary<?> buildDictionaryFromValueEnumerator(DictionaryInfo info, IDictionaryValueEnumerator valueEnumerator) throws IOException{ org.apache.kylin.common.util.Dictionary dict = null; int baseId = 0; // always 0 for now final int nSamples = 5; ArrayList samples = Lists.newArrayListWithCapacity(nSamples); // build dict, case by data type DataType dataType = DataType.getInstance(info.getDataType()); if (dataType.isDateTimeFamily()) dict = buildDateStrDict(valueEnumerator, baseId, nSamples, samples); else if (dataType.isNumberFamily()) dict = buildNumberDict(valueEnumerator, baseId, nSamples, samples); else dict = buildStringDict(valueEnumerator, baseId, nSamples, samples); // log a few samples StringBuilder buf = new StringBuilder(); for (Object s : samples) { if (buf.length() > 0) buf.append(", "); buf.append(s.toString()).append("=>").append(dict.getIdFromValue(s)); } logger.info("Dictionary value samples: " + buf.toString()); logger.info("Dictionary cardinality: " + dict.getSize()); if (dict instanceof TrieDictionary && dict.getSize() > DICT_MAX_CARDINALITY) throw new IllegalArgumentException("Too high cardinality is not suitable for dictionary -- " + info.getSourceTable() + "." + info.getSourceColumn() + " cardinality: " + dict.getSize()); return dict; }
@Override public void init(DictionaryInfo dictInfo, int baseId, String hdfsDir) throws IOException { sourceColumn = dictInfo.getSourceTable() + "." + dictInfo.getSourceColumn(); KylinConfig config = KylinConfig.getInstanceFromEnv(); int maxEntriesPerSlice = config.getAppendDictEntrySize(); if (hdfsDir == null) { //build in Kylin job server hdfsDir = KylinConfig.getInstanceFromEnv().getHdfsWorkingDirectory(); } //use UUID to make each segment dict in different HDFS dir and support concurrent build //use timestamp to make the segment dict easily to delete String baseDir = hdfsDir + "resources/SegmentDict" + dictInfo.getResourceDir() + "/" + RandomUtil.randomUUID().toString() + "_" + System.currentTimeMillis() + "/"; this.builder = new AppendTrieDictionaryBuilder(baseDir, maxEntriesPerSlice, false); this.baseId = baseId; }
@Override public void init(DictionaryInfo dictInfo, int baseId, String hdfsDir) throws IOException { sourceColumn = dictInfo.getSourceTable() + "_" + dictInfo.getSourceColumn(); lock = KylinConfig.getInstanceFromEnv().getDistributedLockFactory().lockForCurrentThread(); lock.lock(getLockPath(sourceColumn), Long.MAX_VALUE); int maxEntriesPerSlice = KylinConfig.getInstanceFromEnv().getAppendDictEntrySize(); if (hdfsDir == null) { //build in Kylin job server hdfsDir = KylinConfig.getInstanceFromEnv().getHdfsWorkingDirectory(); } String baseDir = hdfsDir + "resources/GlobalDict" + dictInfo.getResourceDir() + "/"; try { this.builder = new AppendTrieDictionaryBuilder(baseDir, maxEntriesPerSlice, true); } catch (Throwable e) { lock.unlock(getLockPath(sourceColumn)); throw new RuntimeException( String.format(Locale.ROOT, "Failed to create global dictionary on %s ", sourceColumn), e); } this.baseId = baseId; }