private byte findSmallTable() {
  byte smallTablePos = -1;
  for (byte pos = 0; pos < mapJoinTables.length; pos++) {
    if (pos != conf.getPosBigTable()) {
      smallTablePos = pos;
      break;
    }
  }
  Preconditions.checkState(smallTablePos != -1);
  return smallTablePos;
}
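// A minimal standalone sketch (an illustration, not Hive code) of the same scan
// pattern findSmallTable() uses: return the first table position that is not the
// big table's position. The class and method names here are hypothetical.
public final class SmallTablePosDemo {
  static byte firstSmallPos(int numTables, int posBigTable) {
    for (byte pos = 0; pos < numTables; pos++) {
      if (pos != posBigTable) {
        return pos; // the first non-big position wins
      }
    }
    throw new IllegalStateException("no small table found");
  }

  public static void main(String[] args) {
    System.out.println(firstSmallPos(2, 0)); // big table at 0 -> small table at 1
    System.out.println(firstSmallPos(2, 1)); // big table at 1 -> small table at 0
  }
}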
/**
 * Clean up data participating in the join, i.e. the in-memory and on-disk files
 * for the small table(s) and the big table.
 */
private void cleanupGraceHashJoin() {
  for (byte pos = 0; pos < mapJoinTables.length; pos++) {
    if (pos != conf.getPosBigTable()) {
      LOG.info("Cleaning up small table data at pos: " + pos);
      HybridHashTableContainer container = (HybridHashTableContainer) mapJoinTables[pos];
      container.clear();
    }
  }
}
@Override
public void initializeOp(Configuration hconf) throws HiveException {
  final int posBigTable = conf.getPosBigTable();
  savePosBigTable = posBigTable;

  // We need an input object inspector for the row we will extract out of the
  // vectorized row batch, not, for example, an original inspector for an ORC table, etc.
  inputObjInspectors[posBigTable] =
      VectorizedBatchUtil.convertToStandardStructObjectInspector(
          (StructObjectInspector) inputObjInspectors[posBigTable]);

  // Call super VectorMapJoinOuterFilteredOperator, which calls super MapJoinOperator with
  // the new input inspector.
  super.initializeOp(hconf);

  firstBatch = true;
}
/**
 * Continue processing the join between the spilled hashtable(s) and the spilled big table.
 * @param partitionId the partition number across all small tables to process
 * @throws HiveException
 * @throws IOException
 * @throws SerDeException
 * @throws ClassNotFoundException
 */
private void continueProcess(int partitionId)
    throws HiveException, IOException, SerDeException, ClassNotFoundException {
  for (byte pos = 0; pos < mapJoinTables.length; pos++) {
    if (pos != conf.getPosBigTable()) {
      LOG.info("Going to reload hash partition " + partitionId);
      reloadHashTable(pos, partitionId);
    }
  }
  reProcessBigTable(partitionId);
}
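// Note (summarizing continueProcess() above and reProcessBigTable() below): for each
// spilled partition, the small-table side is reloaded into memory first via
// reloadHashTable(), and only then are the spilled big-table rows replayed through
// process(). Reloading first is what makes the replayed probe rows able to match.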
@Override
public boolean isSame(OperatorDesc other) {
  if (super.isSame(other)) {
    MapJoinDesc otherDesc = (MapJoinDesc) other;
    return Objects.equals(getParentToInput(), otherDesc.getParentToInput())
        && Objects.equals(getKeyCountsExplainDesc(), otherDesc.getKeyCountsExplainDesc())
        && getPosBigTable() == otherDesc.getPosBigTable()
        && isBucketMapJoin() == otherDesc.isBucketMapJoin();
  }
  return false;
}
@Override
protected List<ObjectInspector> getValueObjectInspectors(
    byte alias, List<ObjectInspector>[] aliasToObjectInspectors) {
  int[] valueIndex = conf.getValueIndex(alias);
  if (valueIndex == null) {
    return super.getValueObjectInspectors(alias, aliasToObjectInspectors);
  }
  List<ObjectInspector> inspectors = aliasToObjectInspectors[alias];
  int bigPos = conf.getPosBigTable();
  Converter[] converters = new Converter[valueIndex.length];
  List<ObjectInspector> valueOI = new ArrayList<ObjectInspector>();
  for (int i = 0; i < valueIndex.length; i++) {
    if (valueIndex[i] >= 0 && !joinKeysObjectInspectors[bigPos].isEmpty()) {
      if (conf.getNoOuterJoin()) {
        valueOI.add(joinKeysObjectInspectors[bigPos].get(valueIndex[i]));
      } else {
        // It is an outer join. We are going to add the inspector from the
        // inner side, but the key value will come from the outer side, so
        // we need to create a converter from inputOI to outputOI.
        valueOI.add(inspectors.get(i));
        converters[i] = ObjectInspectorConverters.getConverter(
            joinKeysObjectInspectors[bigPos].get(valueIndex[i]), inspectors.get(i));
      }
    } else {
      valueOI.add(inspectors.get(i));
    }
  }
  unwrapContainer[alias] = new UnwrapRowContainer(alias, valueIndex, converters, hasFilter(alias));
  return valueOI;
}
private Boolean findGrandChildSubqueryMapjoin(MapJoinWalkerCtx ctx, MapJoinOperator mapJoin) {
  Operator<? extends OperatorDesc> parent = mapJoin;
  while (true) {
    if (parent.getChildOperators() == null || parent.getChildOperators().size() != 1) {
      return null;
    }
    Operator<? extends OperatorDesc> ch = parent.getChildOperators().get(0);
    if (ch instanceof MapJoinOperator) {
      if (!nonSubqueryMapJoin((MapJoinOperator) ch, mapJoin)) {
        if (ch.getParentOperators().indexOf(parent)
            == ((MapJoinOperator) ch).getConf().getPosBigTable()) {
          // This branch feeds the child map join as its big table, i.e. the rows
          // do not come from the local (small table) branch.
          return true;
        }
      }
      return false; // not from a sub-query
    }
    if ((ch instanceof JoinOperator) || (ch instanceof UnionOperator)
        || (ch instanceof ReduceSinkOperator) || (ch instanceof LateralViewJoinOperator)
        || (ch instanceof GroupByOperator) || (ch instanceof ScriptOperator)) {
      return null;
    }
    parent = ch;
  }
}
public static <T> Set<T> findOperatorsUpstreamJoinAccounted(Operator<?> start, Class<T> clazz,
    Set<T> found) {
  if (clazz.isInstance(start)) {
    found.add((T) start);
  }
  int onlyIncludeIndex = -1;
  if (start instanceof AbstractMapJoinOperator) {
    AbstractMapJoinOperator mapJoinOp = (AbstractMapJoinOperator) start;
    MapJoinDesc desc = (MapJoinDesc) mapJoinOp.getConf();
    onlyIncludeIndex = desc.getPosBigTable();
  }
  if (start.getParentOperators() != null) {
    int i = 0;
    for (Operator<?> parent : start.getParentOperators()) {
      if (onlyIncludeIndex >= 0) {
        if (onlyIncludeIndex == i) {
          findOperatorsUpstreamJoinAccounted(parent, clazz, found);
        }
      } else {
        findOperatorsUpstreamJoinAccounted(parent, clazz, found);
      }
      i++;
    }
  }
  return found;
}
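// A hypothetical usage sketch (the mapJoinOp variable is an assumption, not taken
// from the snippets above): walk upstream from a map join, following only the
// big-table parent at each join encountered, and collect the table scans that
// actually feed the streamed side.
Set<TableScanOperator> bigTableScans = findOperatorsUpstreamJoinAccounted(
    mapJoinOp, TableScanOperator.class, new HashSet<TableScanOperator>());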
public VectorMapJoinBaseOperator(CompilationOpContext ctx, VectorizationContext vContext,
    OperatorDesc conf) throws HiveException {
  super(ctx);

  MapJoinDesc desc = (MapJoinDesc) conf;
  this.conf = desc;

  order = desc.getTagOrder();
  numAliases = desc.getExprs().size();
  posBigTable = (byte) desc.getPosBigTable();
  filterMaps = desc.getFilterMap();
  noOuterJoin = desc.isNoOuterJoin();

  // We are making a new output vectorized row batch.
  vOutContext = new VectorizationContext(getName(), desc.getOutputColumnNames(),
      /* vContextEnvironment */ vContext);
}
public VectorMapJoinBaseOperator(CompilationOpContext ctx, OperatorDesc conf,
    VectorizationContext vContext, VectorDesc vectorDesc) throws HiveException {
  super(ctx);

  MapJoinDesc desc = (MapJoinDesc) conf;
  this.conf = desc;
  this.vContext = vContext;
  this.vectorDesc = (VectorMapJoinDesc) vectorDesc;

  order = desc.getTagOrder();
  numAliases = desc.getExprs().size();
  posBigTable = (byte) desc.getPosBigTable();
  filterMaps = desc.getFilterMap();
  noOuterJoin = desc.isNoOuterJoin();

  // We are making a new output vectorized row batch.
  vOutContext = new VectorizationContext(getName(), desc.getOutputColumnNames(),
      /* vContextEnvironment */ vContext);
  vOutContext.setInitialTypeInfos(Arrays.asList(getOutputTypeInfos(desc)));
}
private boolean validateMapJoinDesc(MapJoinDesc desc) {
  byte posBigTable = (byte) desc.getPosBigTable();
  List<ExprNodeDesc> filterExprs = desc.getFilters().get(posBigTable);
  if (!validateExprNodeDesc(
      filterExprs, "Filter", VectorExpressionDescriptor.Mode.FILTER, /* allowComplex */ true)) {
    return false;
  }
  List<ExprNodeDesc> keyExprs = desc.getKeys().get(posBigTable);
  if (!validateExprNodeDesc(keyExprs, "Key")) {
    return false;
  }
  List<ExprNodeDesc> valueExprs = desc.getExprs().get(posBigTable);
  if (!validateExprNodeDesc(valueExprs, "Value")) {
    return false;
  }
  Byte[] order = desc.getTagOrder();
  Byte posSingleVectorMapJoinSmallTable = (order[0] == posBigTable ? order[1] : order[0]);
  List<ExprNodeDesc> smallTableExprs = desc.getExprs().get(posSingleVectorMapJoinSmallTable);
  if (!validateExprNodeDesc(smallTableExprs, "Small Table")) {
    return false;
  }
  if (desc.getResidualFilterExprs() != null && !desc.getResidualFilterExprs().isEmpty()) {
    setOperatorIssue("Non-equi joins not supported");
    return false;
  }
  return true;
}
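// Note on the ternary over desc.getTagOrder() above: it assumes a binary join.
// With exactly two tags, whichever of order[0]/order[1] is not the big table's
// position must be the single small table, which is why the variable is named
// posSingleVectorMapJoinSmallTable.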
if (idx == getConf().getPosBigTable()) {
  continue;
}
/**
 * Iterate over the big table row container and feed process() with the leftover rows.
 * @param partitionId the partition from which to take out spilled big table rows
 * @throws HiveException
 */
protected void reProcessBigTable(int partitionId) throws HiveException {
  // For a binary join, firstSmallTable is the only small table and holds the reference
  // to the spilled big table rows. For an n-way join, we only spill once (while
  // processing the first small table), so again only firstSmallTable holds that reference.
  HashPartition partition = firstSmallTable.getHashPartitions()[partitionId];
  ObjectContainer bigTable = partition.getMatchfileObjContainer();
  LOG.info("Hybrid Grace Hash Join: Going to process spilled big table rows in partition "
      + partitionId + ". Number of rows: " + bigTable.size());
  while (bigTable.hasNext()) {
    Object row = bigTable.next();
    process(row, conf.getPosBigTable());
  }
  bigTable.clear();
}
@Override
@SuppressWarnings("unchecked")
protected void initializeOp(Configuration hconf) throws HiveException {
  if (conf.getGenJoinKeys()) {
    int tagLen = conf.getTagLength();
    joinKeys = new List[tagLen];
    JoinUtil.populateJoinKeyValue(joinKeys, conf.getKeys(), NOTSKIPBIGTABLE, hconf);
    joinKeysObjectInspectors = JoinUtil.getObjectInspectorsFromEvaluators(joinKeys,
        inputObjInspectors, NOTSKIPBIGTABLE, tagLen);
  }
  super.initializeOp(hconf);

  numMapRowsRead = 0;

  // All other tables are small, and are cached in the hash table.
  posBigTable = (byte) conf.getPosBigTable();

  emptyList = new RowContainer<List<Object>>(1, hconf, reporter);
  RowContainer<List<Object>> bigPosRC = JoinUtil.getRowContainer(hconf,
      rowContainerStandardObjectInspectors[posBigTable], posBigTable, joinCacheSize,
      spillTableDesc, conf, !hasFilter(posBigTable), reporter);
  storage[posBigTable] = bigPosRC;
}
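// Note: only the big table gets a RowContainer here; the small-table sides are
// already materialized in the hash tables, so storage[] needs a row container
// only at posBigTable (plus the shared emptyList for positions with no rows).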
public HashTableSinkDesc(MapJoinDesc clone) {
  this.bigKeysDirMap = clone.getBigKeysDirMap();
  this.conds = clone.getConds();
  this.exprs = new HashMap<Byte, List<ExprNodeDesc>>(clone.getExprs());
  this.handleSkewJoin = clone.getHandleSkewJoin();
  this.keyTableDesc = clone.getKeyTableDesc();
  this.noOuterJoin = clone.getNoOuterJoin();
  this.outputColumnNames = clone.getOutputColumnNames();
  this.reversedExprs = clone.getReversedExprs();
  this.skewKeyDefinition = clone.getSkewKeyDefinition();
  this.skewKeysValuesTables = clone.getSkewKeysValuesTables();
  this.smallKeysDirMap = clone.getSmallKeysDirMap();
  this.tagOrder = clone.getTagOrder();
  this.filters = new HashMap<Byte, List<ExprNodeDesc>>(clone.getFilters());
  this.filterMap = clone.getFilterMap();
  this.keys = new HashMap<Byte, List<ExprNodeDesc>>(clone.getKeys());
  this.keyTblDesc = clone.getKeyTblDesc();
  this.valueTblDescs = clone.getValueTblDescs();
  this.valueTblFilteredDescs = clone.getValueFilteredTblDescs();
  this.posBigTable = clone.getPosBigTable();
  this.retainList = clone.getRetainList();
  this.dumpFilePrefix = clone.getDumpFilePrefix();
  this.bucketMapjoinContext = new BucketMapJoinContext(clone);
  this.hashtableMemoryUsage = clone.getHashTableMemoryUsage();
}