DictionaryInfo largestDictInfo = findLargestDictInfo(newDictInfo); if (largestDictInfo != null) { largestDictInfo = getDictionaryInfo(largestDictInfo.getResourcePath()); Dictionary<String> largestDictObject = largestDictInfo.getDictionaryObject(); if (largestDictObject.contains(newDict)) { logger.info("dictionary content " + newDict + ", is contained by dictionary at " + largestDictInfo.getResourcePath()); return largestDictInfo; } else if (newDict.contains(largestDictObject)) {
if (dict.getDictionaryClass().equals(AppendTrieDictionary.class.getName())) { return dict; firstDictInfo = info; } else { if (!firstDictInfo.isDictOnSameColumn(info)) { logger.warn("Merging dictionaries are not structurally equal : " + firstDictInfo.getResourcePath() + " and " + info.getResourcePath()); totalSize += info.getInput().getSize(); DictionaryInfo newDictInfo = new DictionaryInfo(firstDictInfo); TableSignature signature = newDictInfo.getInput(); signature.setSize(totalSize); signature.setLastModifiedTime(System.currentTimeMillis()); if (!dicts.get(0).getDictionaryObject().equals(dicts.get(i).getDictionaryObject())) { identicalSourceDicts = false; break; return dicts.get(0); } else { Dictionary<String> newDict = DictionaryGenerator.mergeDictionaries(DataType.getType(newDictInfo.getDataType()), dicts); return trySaveNewDict(newDict, newDictInfo);
/**
 * Prints the content of serialized dictionaries to standard output.
 *
 * If {@code f} is a directory, every entry in it is dumped recursively. If it is a
 * file whose name ends with {@code .dict}, it is deserialized with a
 * {@link DictionaryInfoSerializer} and its path, last-modified date, JSON form and
 * dictionary content are printed. Any other file is ignored.
 *
 * @param f file or directory to dump
 * @throws IOException if reading or deserializing a dictionary file fails
 */
public static void dump(File f) throws IOException {
    if (f.isDirectory()) {
        File[] files = f.listFiles();
        if (files == null) {
            // listFiles() returned nothing usable; there is nothing to walk
            return;
        }
        for (File c : files) {
            dump(c);
        }
        return;
    }

    if (f.getName().endsWith(".dict")) {
        DictionaryInfoSerializer ser = new DictionaryInfoSerializer();
        // try-with-resources closes the file handle even when deserialization or
        // printing fails; the stream stays open for as long as dictInfo is in use.
        try (DataInputStream in = new DataInputStream(new FileInputStream(f))) {
            DictionaryInfo dictInfo = ser.deserialize(in);

            System.out.println("============================================================================");
            System.out.println("File: " + f.getAbsolutePath());
            System.out.println(new Date(dictInfo.getLastModified()));
            System.out.println(JsonUtil.writeValueAsIndentString(dictInfo));
            dictInfo.getDictionaryObject().dump(System.out);
            System.out.println();
        }
    }
}
}
/**
 * Finds the stored dictionary with the highest cardinality among all entries that
 * live in the same resource directory as {@code dictInfo}.
 *
 * Entries are loaded with {@code DictionaryInfoSerializer.INFO_SERIALIZER}. When
 * several entries share the highest cardinality, the first one encountered wins.
 *
 * @param dictInfo dictionary info whose resource directory is scanned
 * @return the entry with the greatest cardinality, or {@code null} if the directory holds none
 * @throws IOException if the resource store cannot be read
 */
private DictionaryInfo findLargestDictInfo(DictionaryInfo dictInfo) throws IOException {
    final List<DictionaryInfo> candidates = getStore().getAllResources(dictInfo.getResourceDir(), DictionaryInfoSerializer.INFO_SERIALIZER);

    DictionaryInfo best = null;
    for (DictionaryInfo candidate : candidates) {
        // strict "<" keeps the earlier entry on ties
        if (best == null || best.getCardinality() < candidate.getCardinality()) {
            best = candidate;
        }
    }
    return best;
}
/**
 * Removes every dictionary stored for the given source table and column.
 *
 * A throw-away {@link DictionaryInfo} is populated with the table and column only to
 * derive the resource directory; each resource listed under that directory is then
 * passed to {@code removeDictionary}.
 *
 * @param srcTable source table name
 * @param srcCol   source column name
 * @throws IOException if listing or removing resources fails
 */
public void removeDictionaries(String srcTable, String srcCol) throws IOException {
    DictionaryInfo probe = new DictionaryInfo();
    probe.setSourceTable(srcTable);
    probe.setSourceColumn(srcCol);

    NavigableSet<String> storedPaths = getStore().listResources(probe.getResourceDir());
    if (storedPaths != null) {
        for (String storedPath : storedPaths) {
            removeDictionary(storedPath);
        }
    }
}
/**
 * Looks for an already stored dictionary that was built from the same input as
 * {@code dictInfo}.
 *
 * All entries in the resource directory of {@code dictInfo} are loaded with
 * {@code DictionaryInfoSerializer.INFO_SERIALIZER} and their input signatures are
 * compared with {@code equals}.
 *
 * @param dictInfo dictionary info whose input signature is searched for
 * @return resource path of the first entry with an equal input signature, or {@code null} if none matches
 * @throws IOException if the resource store cannot be read
 */
private String checkDupByInfo(DictionaryInfo dictInfo) throws IOException {
    final ResourceStore resourceStore = getStore();
    final List<DictionaryInfo> siblings = resourceStore.getAllResources(dictInfo.getResourceDir(), DictionaryInfoSerializer.INFO_SERIALIZER);
    final TableSignature wantedInput = dictInfo.getInput();

    for (DictionaryInfo sibling : siblings) {
        if (!wantedInput.equals(sibling.getInput())) {
            continue;
        }
        return sibling.getResourcePath();
    }
    return null;
}
DictionaryInfo dictSrc = srcDicMgr.getDictionaryInfo(item); long ts = dictSrc.getLastModified(); dictSrc.setLastModified(0);//to avoid resource store write conflict Dictionary dictObj = dictSrc.getDictionaryObject().copyToAnotherMeta(srcConfig, dstConfig); DictionaryInfo dictSaved = dstDictMgr.trySaveNewDict(dictObj, dictSrc); dictSrc.setLastModified(ts); for (Map.Entry<String, String> entry : segment.getDictionaries().entrySet()) { if (entry.getValue().equalsIgnoreCase(item)) { entry.setValue(dictSaved.getResourcePath()); logger.info("Item " + item + " is dup, instead " + dictSaved.getResourcePath() + " is reused");
/**
 * Creates an enumerator over the dictionaries referenced by {@code dictionaryInfoList}.
 *
 * For each entry, in list order, the dictionary object is appended to
 * {@code dictionaryList} and its minimum id is appended to {@code curKeys}.
 *
 * @param dataType           data type stored on this enumerator
 * @param dictionaryInfoList dictionary infos whose dictionary objects are enumerated
 */
public MultipleDictionaryValueEnumerator(DataType dataType, List<DictionaryInfo> dictionaryInfoList) {
    this.dataType = dataType;
    dictionaryList = Lists.newArrayListWithCapacity(dictionaryInfoList.size());
    for (DictionaryInfo dictInfo : dictionaryInfoList) {
        // Fetch and cast once, then reuse the same reference for both collections
        // instead of calling getDictionaryObject() a second time.
        Dictionary<String> dictionary = (Dictionary<String>) dictInfo.getDictionaryObject();
        dictionaryList.add(dictionary);
        curKeys.add(dictionary.getMinId());
    }
}
/**
 * Writes {@code dict} to the resource store at the dictionary's own resource path.
 *
 * The entry is written with {@code DictionaryInfoSerializer.FULL_SERIALIZER} and is
 * stamped with the current system time.
 *
 * @param dict dictionary info to persist
 * @throws IOException if the resource store write fails
 */
void save(DictionaryInfo dict) throws IOException {
    final ResourceStore resourceStore = getStore();
    final String targetPath = dict.getResourcePath();

    logger.info("Saving dictionary at " + targetPath);

    final long writeTime = System.currentTimeMillis();
    resourceStore.putBigResource(targetPath, dict, writeTime, DictionaryInfoSerializer.FULL_SERIALIZER);
}
/**
 * Saves the given per-column dictionaries through the {@link DictionaryManager} and
 * registers the resulting resource paths on {@code cubeSegment}.
 *
 * For every column a table signature is synthesized: the path is
 * {@code streaming_<startOffset>_<endOffset>}, the size is
 * {@code endOffset - startOffset} and the last-modified time is the current system
 * time. The dictionary returned by {@code trySaveNewDict} is the one recorded on the
 * segment and placed in the result map.
 *
 * @param cubeSegment   segment that receives the dictionary resource paths
 * @param dictionaryMap dictionaries to save, keyed by column
 * @param startOffset   start offset used for the signature path and size
 * @param endOffset     end offset used for the signature path and size
 * @return map from column to the dictionary object of the saved (or reused) dictionary info
 * @throws RuntimeException wrapping the {@link IOException} if saving a dictionary fails
 */
@SuppressWarnings("unchecked")
public static Map<TblColRef, Dictionary<String>> writeDictionary(CubeSegment cubeSegment, Map<TblColRef, Dictionary<String>> dictionaryMap, long startOffset, long endOffset) {
    final Map<TblColRef, Dictionary<String>> savedDicts = Maps.newHashMap();

    for (Map.Entry<TblColRef, Dictionary<String>> columnAndDict : dictionaryMap.entrySet()) {
        final TblColRef column = columnAndDict.getKey();
        final Dictionary<String> builtDict = columnAndDict.getValue();

        // Synthetic signature describing the streaming offset range
        final IReadableTable.TableSignature offsetSignature = new IReadableTable.TableSignature();
        offsetSignature.setLastModifiedTime(System.currentTimeMillis());
        offsetSignature.setPath(String.format(Locale.ROOT, "streaming_%s_%s", startOffset, endOffset));
        offsetSignature.setSize(endOffset - startOffset);

        final DictionaryInfo pendingInfo = new DictionaryInfo(column.getColumnDesc(), column.getDatatype(), offsetSignature);
        logger.info("writing dictionary for TblColRef:" + column.toString());

        final DictionaryManager manager = DictionaryManager.getInstance(cubeSegment.getCubeDesc().getConfig());
        try {
            final DictionaryInfo storedInfo = manager.trySaveNewDict(builtDict, pendingInfo);
            cubeSegment.putDictResPath(column, storedInfo.getResourcePath());
            savedDicts.put(column, (Dictionary<String>) storedInfo.getDictionaryObject());
        } catch (IOException e) {
            throw new RuntimeException("error save dictionary for column:" + column, e);
        }
    }

    return savedDicts;
}
firstDictInfo = info; } else { if (!firstDictInfo.isDictOnSameColumn(info)) { logger.warn("Merging dictionaries are not structurally equal : " + firstDictInfo.getResourcePath() + " and " + info.getResourcePath()); totalSize += info.getInput().getSize(); DictionaryInfo newDictInfo = new DictionaryInfo(firstDictInfo); TableSignature signature = newDictInfo.getInput(); signature.setSize(totalSize); signature.setLastModifiedTime(System.currentTimeMillis()); logger.info("Identical dictionary input " + newDictInfo.getInput() + ", reuse existing dictionary at " + dupDict); return getDictionaryInfo(dupDict); if (!dicts.get(0).getDictionaryObject().equals(dicts.get(i).getDictionaryObject())) { identicalSourceDicts = false; break;
/**
 * Searches the resource directory of {@code dictInfo} for a stored dictionary that
 * can stand in for {@code dict}.
 *
 * A stored dictionary matches when {@code config.isDictResuable()} is true and it
 * contains {@code dict}, or when {@code dict} equals it. Paths whose dictionary info
 * cannot be loaded ({@code null}) are skipped. A warning is logged when more than
 * 100 dictionaries exist in the directory.
 *
 * @param dictInfo dictionary info whose resource directory is scanned
 * @param dict     dictionary content to look for
 * @return resource path of the first matching dictionary, or {@code null} if there is
 *         no listing or no match
 * @throws IOException if the resource store cannot be read
 */
private String checkDupByContent(DictionaryInfo dictInfo, Dictionary<String> dict) throws IOException {
    final ResourceStore resourceStore = getStore();
    final NavigableSet<String> candidatePaths = resourceStore.listResources(dictInfo.getResourceDir());
    if (candidatePaths == null) {
        return null;
    }

    final int candidateCount = candidatePaths.size();
    logger.info("{} existing dictionaries of the same column", candidateCount);
    if (candidateCount > 100) {
        logger.warn("Too many dictionaries under {}, dict count: {}", dictInfo.getResourceDir(), candidateCount);
    }

    for (String candidatePath : candidatePaths) {
        final DictionaryInfo candidate = getDictionaryInfo(candidatePath);
        if (candidate == null) {
            continue;
        }

        // the containment check only counts when dictionary reuse is enabled
        final boolean reusableSuperset = config.isDictResuable() && candidate.getDictionaryObject().contains(dict);
        if (reusableSuperset || dict.equals(candidate.getDictionaryObject())) {
            return candidatePath;
        }
    }
    return null;
}
/**
 * Builds the dictionary for the same column twice and checks that the second build
 * yields the same dictionary: equal UUID, the same cached {@link DictionaryInfo}
 * instance when looked up by resource path, and the same dictionary object.
 * Finally reads the dictionary values through {@code touchDictValues}.
 */
@Test
@Ignore("hive not ready")
public void basic() throws Exception {
    CubeDesc cubeDesc = CubeDescManager.getInstance(getTestConfig()).getCubeDesc("test_kylin_cube_without_slr_desc");
    TblColRef col = cubeDesc.findColumnRef("DEFAULT.TEST_CATEGORY_GROUPINGS", "META_CATEG_NAME");

    DictionaryInfo info1 = dictMgr.buildDictionary(cubeDesc.getModel(), cubeDesc.getRowkey().getDictionary(col), col, null);
    System.out.println(JsonUtil.writeValueAsIndentString(info1));

    DictionaryInfo info2 = dictMgr.buildDictionary(cubeDesc.getModel(), cubeDesc.getRowkey().getDictionary(col), col, null);
    System.out.println(JsonUtil.writeValueAsIndentString(info2));

    // Compare the UUIDs by value; "==" would only pass when both calls happen to
    // hand back the very same instance.
    assertTrue(info1.getUuid().equals(info2.getUuid()));

    // The remaining checks deliberately use identity.
    assertTrue(info1 == dictMgr.getDictionaryInfo(info1.getResourcePath()));
    assertTrue(info2 == dictMgr.getDictionaryInfo(info2.getResourcePath()));

    assertTrue(info1.getDictionaryObject() == info2.getDictionaryObject());

    touchDictValues(info1);
}
for (String dictPath : segment.getDictionaryPaths()) { DictionaryInfo dictInfo = store.getResource(dictPath, DictionaryInfoSerializer.FULL_SERIALIZER); if ("org.apache.kylin.dict.AppendTrieDictionary".equals(dictInfo != null ? dictInfo.getDictionaryClass() : null)){ String dictObj = dictInfo.getDictionaryObject().toString(); String basedir = dictObj.substring(dictObj.indexOf("(") + 1, dictObj.indexOf(")") - 1); if (basedir.startsWith(KylinConfig.getInstanceFromEnv().getHdfsWorkingDirectory() + "/resources/GlobalDict")) { activeResources.add(KylinConfig.getInstanceFromEnv().getHdfsWorkingDirectory() + "resources/GlobalDict" + dictInfo.getResourceDir()); } else if (basedir.startsWith(KylinConfig.getInstanceFromEnv().getHdfsWorkingDirectory() + "/resources/SegmentDict")) { activeResources.add(KylinConfig.getInstanceFromEnv().getHdfsWorkingDirectory() + "resources/SegmentDict" + dictInfo.getResourceDir());
/**
 * Stores {@code newDict} unless a dictionary with matching content already exists.
 *
 * When {@code checkDupByContent} finds a match, the existing dictionary info is
 * loaded and returned. Otherwise {@code newDictInfo} is filled with the dictionary's
 * size (as cardinality), the dictionary object and its class name, then saved and
 * put into {@code dictCache} under its resource path.
 *
 * @param newDict     dictionary content to store
 * @param newDictInfo metadata describing {@code newDict}; modified in place when saved
 * @return the existing dictionary info on a match, otherwise {@code newDictInfo}
 * @throws IOException if reading from or writing to the resource store fails
 */
public DictionaryInfo trySaveNewDict(Dictionary<?> newDict, DictionaryInfo newDictInfo) throws IOException {
    final String existingPath = checkDupByContent(newDictInfo, newDict);

    if (existingPath == null) {
        // nothing to reuse: complete the metadata and persist the new dictionary
        newDictInfo.setCardinality(newDict.getSize());
        newDictInfo.setDictionaryObject(newDict);
        newDictInfo.setDictionaryClass(newDict.getClass().getName());

        save(newDictInfo);
        dictCache.put(newDictInfo.getResourcePath(), newDictInfo);
        return newDictInfo;
    }

    logger.info("Identical dictionary content " + newDict + ", reuse existing dictionary at " + existingPath);
    return getDictionaryInfo(existingPath);
}
signature.setPath("fake_dict_for" + lfn.getName() + segment.getName()); DictionaryInfo newDictInfo = new DictionaryInfo(lfn.getTable(), lfn.getColumn().getName(), lfn.getColumn().getZeroBasedIndex(), "string", signature, ""); ((TrieDictionary) dict).dump(System.out); segment.putDictResPath(lfn, newDictInfo.getResourcePath()); segment.putDictResPath(lsi, sharedDict.getResourcePath()); segment.putDictResPath(ssc, sharedDict.getResourcePath());
/**
 * Saves {@code newDict} under {@code newDictInfo}, or reuses a stored dictionary
 * whose content matches.
 *
 * A match reported by {@code checkDupByContent} short-circuits the save and the
 * existing dictionary info is returned. Otherwise the dictionary object and its
 * class name are recorded on {@code newDictInfo}, which is saved and cached in
 * {@code dictCache} by resource path.
 *
 * NOTE(review): this method does not set the cardinality on {@code newDictInfo};
 * confirm the caller populates it if it is needed.
 *
 * @param newDict     dictionary content to store
 * @param newDictInfo metadata describing {@code newDict}; modified in place when saved
 * @return the existing dictionary info on a match, otherwise {@code newDictInfo}
 * @throws IOException if reading from or writing to the resource store fails
 */
public DictionaryInfo trySaveNewDict(Dictionary<?> newDict, DictionaryInfo newDictInfo) throws IOException {
    final String duplicatePath = checkDupByContent(newDictInfo, newDict);
    final boolean alreadyStored = duplicatePath != null;

    if (alreadyStored) {
        logger.info("Identical dictionary content " + newDict + ", reuse existing dictionary at " + duplicatePath);
        return getDictionaryInfo(duplicatePath);
    }

    newDictInfo.setDictionaryObject(newDict);
    final String dictClassName = newDict.getClass().getName();
    newDictInfo.setDictionaryClass(dictClassName);

    save(newDictInfo);

    final String savedPath = newDictInfo.getResourcePath();
    dictCache.put(savedPath, newDictInfo);
    return newDictInfo;
}
/**
 * Reads the value for every id from 0 up to (but excluding) the dictionary's
 * cardinality and asserts that the number of distinct values equals the cardinality.
 *
 * @param info1 dictionary info whose dictionary object is read
 */
@SuppressWarnings("unchecked")
private void touchDictValues(DictionaryInfo info1) {
    final Dictionary<String> dictionary = (Dictionary<String>) info1.getDictionaryObject();
    final HashSet<String> distinctValues = new HashSet<String>();

    final int idLimit = info1.getCardinality();
    int id = 0;
    while (id < idLimit) {
        distinctValues.add(dictionary.getValueFromId(id));
        id++;
    }

    assertEquals(info1.getCardinality(), distinctValues.size());
}
}
@Override public void init(DictionaryInfo dictInfo, int baseId, String hdfsDir) throws IOException { sourceColumn = dictInfo.getSourceTable() + "." + dictInfo.getSourceColumn(); KylinConfig config = KylinConfig.getInstanceFromEnv(); int maxEntriesPerSlice = config.getAppendDictEntrySize(); if (hdfsDir == null) { //build in Kylin job server hdfsDir = KylinConfig.getInstanceFromEnv().getHdfsWorkingDirectory(); } //use UUID to make each segment dict in different HDFS dir and support concurrent build //use timestamp to make the segment dict easily to delete String baseDir = hdfsDir + "resources/SegmentDict" + dictInfo.getResourceDir() + "/" + RandomUtil.randomUUID().toString() + "_" + System.currentTimeMillis() + "/"; this.builder = new AppendTrieDictionaryBuilder(baseDir, maxEntriesPerSlice, false); this.baseId = baseId; }
private DictionaryInfo createDictionaryInfo(TblColRef col, IReadableTable inpTable) throws IOException { TableSignature inputSig = inpTable.getSignature(); if (inputSig == null) // table does not exists throw new IllegalStateException("Input table does not exist: " + inpTable); DictionaryInfo dictInfo = new DictionaryInfo(col.getColumnDesc(), col.getDatatype(), inputSig); return dictInfo; }