Map<Path, ArrayList<String>> sourcePathToAliases = source.getPathToAliases();
Map<Path, PartitionDesc> sourcePathToPartitionInfo = source.getPathToPartitionInfo();
Map<String, Operator<? extends OperatorDesc>> sourceAliasToWork = source.getAliasToWork();
Map<String, PartitionDesc> sourceAliasToPartnInfo = source.getAliasToPartnInfo();

LinkedHashMap<Path, ArrayList<String>> targetPathToAliases = target.getPathToAliases();
LinkedHashMap<Path, PartitionDesc> targetPathToPartitionInfo = target.getPathToPartitionInfo();
Map<String, Operator<? extends OperatorDesc>> targetAliasToWork = target.getAliasToWork();
Map<String, PartitionDesc> targetAliasToPartnInfo = target.getAliasToPartnInfo();

// Write the (possibly updated) maps back through the setters, which intern
// duplicate path strings (see the Kryo read() override below).
target.setPathToAliases(targetPathToAliases);
target.setPathToPartitionInfo(targetPathToPartitionInfo);
Iterator<Path> it = work.getPathToPartitionInfo().keySet().iterator();
while (it.hasNext()) {
  Path p = it.next();
  PartitionDesc desc = work.getPathToPartitionInfo().get(p);
  Map<String, String> spec = desc.getPartSpec();
  if (spec == null) {
    LOG.info("Pruning path: " + p);
    it.remove();
    work.removePathToAlias(p);
    work.getPartitionDescs().remove(desc);
  }
}
@Test
public void testGetAndSetConsistency() {
  MapWork mw = new MapWork();
  LinkedHashMap<Path, ArrayList<String>> pathToAliases = new LinkedHashMap<>();
  pathToAliases.put(new Path("p0"), Lists.newArrayList("a1", "a2"));
  mw.setPathToAliases(pathToAliases);

  LinkedHashMap<Path, ArrayList<String>> pta = mw.getPathToAliases();
  assertEquals(pathToAliases, pta);
}
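// A hedged companion check (a sketch, not from the original source): the
// incremental mutators used elsewhere in this section, addPathToAlias and
// removePathToAlias, should be visible through the same map that
// setPathToAliases installed.
@Test
public void testAddAndRemovePathToAlias() {
  MapWork mw = new MapWork();
  mw.setPathToAliases(new LinkedHashMap<Path, ArrayList<String>>());

  Path p = new Path("p1");
  mw.addPathToAlias(p, "a1");
  assertTrue(mw.getPathToAliases().containsKey(p));

  mw.removePathToAlias(p);
  assertFalse(mw.getPathToAliases().containsKey(p));
}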
@Override
public MapWork read(Kryo kryo, Input input, Class<MapWork> type) {
  MapWork mapWork = super.read(kryo, input, type);
  // The set methods in MapWork intern any duplicate strings, which is why
  // we call them during de-serialization.
  mapWork.setPathToPartitionInfo(mapWork.getPathToPartitionInfo());
  mapWork.setPathToAliases(mapWork.getPathToAliases());
  return mapWork;
  }
}
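// Sketch only: how a serializer like the one above could be wired into a Kryo
// instance. The class name MapWorkSerializer is an assumption (the excerpt does
// not show it), and the serializer is assumed to extend Kryo's FieldSerializer
// (hence the super.read call), whose constructor takes the Kryo instance and
// the target class.
Kryo kryo = new Kryo();
kryo.register(MapWork.class, new MapWorkSerializer(kryo, MapWork.class));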
private static void updatePathForMapWork(Path newPath, MapWork work, Path path) {
  // update the work
  if (!newPath.equals(path)) {
    PartitionDesc partDesc = work.getPathToPartitionInfo().get(path);
    work.addPathToAlias(newPath, work.getPathToAliases().get(path));
    work.removePathToAlias(path);
    work.removePathToPartitionInfo(path);
    work.addPathToPartitionInfo(newPath, partDesc);
  }
}
/**
 * Create a new plan and return it. The plan won't contain the name-to-split
 * sample information from the parse context.
 *
 * @return the new plan
 */
public static MapredWork getMapRedWorkFromConf(HiveConf conf) {
  MapredWork mrWork = new MapredWork();
  MapWork work = mrWork.getMapWork();

  boolean mapperCannotSpanPartns = conf.getBoolVar(
      HiveConf.ConfVars.HIVE_MAPPER_CANNOT_SPAN_MULTIPLE_PARTITIONS);
  work.setMapperCannotSpanPartns(mapperCannotSpanPartns);
  work.setPathToAliases(new LinkedHashMap<Path, ArrayList<String>>());
  work.setPathToPartitionInfo(new LinkedHashMap<Path, PartitionDesc>());
  work.setAliasToWork(new LinkedHashMap<String, Operator<? extends OperatorDesc>>());
  return mrWork;
}
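// Usage sketch (assumed caller code, not from the original source): the empty
// skeleton returned above is typically populated through the add* methods and
// aliasToWork map seen in the other snippets here. inputPath, partDesc and
// tableScanOp are hypothetical placeholders.
MapredWork mrWork = getMapRedWorkFromConf(conf);
MapWork work = mrWork.getMapWork();
work.addPathToAlias(inputPath, "t1");             // inputPath: assumed Path
work.addPathToPartitionInfo(inputPath, partDesc); // partDesc: assumed PartitionDesc
work.getAliasToWork().put("t1", tableScanOp);     // tableScanOp: assumed operator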
private void handleSampling(Context context, MapWork mWork, JobConf job)
    throws Exception {
  assert mWork.getAliasToWork().keySet().size() == 1;

  String alias = mWork.getAliases().get(0);
  Operator<?> topOp = mWork.getAliasToWork().get(alias);
  PartitionDesc partDesc = mWork.getAliasToPartnInfo().get(alias);

  ArrayList<PartitionDesc> parts = mWork.getPartitionDescs();
  List<Path> inputPaths = mWork.getPaths();

  PartitionKeySampler sampler = new PartitionKeySampler();

  if (mWork.getSamplingType() == MapWork.SAMPLING_ON_PREV_MR) {
    console.printInfo("Use sampling data created in previous MR");
    // ... (reuse sample data written by the previous MR job)
  } else if (mWork.getSamplingType() == MapWork.SAMPLING_ON_START) {
    console.printInfo("Creating sampling data..");
    assert topOp instanceof TableScanOperator;
    // ... (sample rows directly from the input paths)
  } else {
    throw new IllegalArgumentException("Invalid sampling type " + mWork.getSamplingType());
  }
  // ... (write the sampled partition keys and configure the job)
}
private boolean targetsOfSameDPPSink(MapWork first, MapWork second) {
  Set<String> sources1 = first.getEventSourceColumnNameMap().keySet();
  Set<String> sources2 = second.getEventSourceColumnNameMap().keySet();
  if (!sources1.equals(sources2)) {
    return false;
  }
  for (String source : sources1) {
    // Column names must match once the target id suffix is stripped off.
    Set<String> names1 = first.getEventSourceColumnNameMap().get(source).stream().map(
        SparkPartitionPruningSinkDesc::stripOffTargetId).collect(Collectors.toSet());
    Set<String> names2 = second.getEventSourceColumnNameMap().get(source).stream().map(
        SparkPartitionPruningSinkDesc::stripOffTargetId).collect(Collectors.toSet());
    if (!names1.equals(names2)) {
      return false;
    }
    Set<String> types1 = new HashSet<>(first.getEventSourceColumnTypeMap().get(source));
    Set<String> types2 = new HashSet<>(second.getEventSourceColumnTypeMap().get(source));
    if (!types1.equals(types2)) {
      return false;
    }
    Set<TableDesc> tableDescs1 = new HashSet<>(first.getEventSourceTableDescMap().get(source));
    Set<TableDesc> tableDescs2 = new HashSet<>(second.getEventSourceTableDescMap().get(source));
    if (!tableDescs1.equals(tableDescs2)) {
      return false;
    }
    List<ExprNodeDesc> descs1 = first.getEventSourcePartKeyExprMap().get(source);
    List<ExprNodeDesc> descs2 = second.getEventSourcePartKeyExprMap().get(source);
    if (descs1.size() != descs2.size()) {
      return false;
    }
    // ... (compare the partition key expressions pairwise)
  }
  return true;
}
Map<String, Configuration> tableNameToConf = new HashMap<>();

for (Map.Entry<Path, ArrayList<String>> e : conf.getPathToAliases().entrySet()) {
  List<String> aliases = e.getValue();
  if (aliases == null || aliases.isEmpty()) {
    continue;
  }
  String tableName = conf.getPathToPartitionInfo().get(e.getKey()).getTableName();
  if (tableNameToConf.containsKey(tableName)) {
    continue;
  }
  for (String alias : aliases) {
    Operator<?> rootOp = conf.getAliasToWork().get(alias);
    if (!(rootOp instanceof TableScanOperator)) {
      continue;
    }
    // ... (clone hconf for this table scan, e.g. for nested column pruning)
  }
}

// Fall back to the unmodified conf for any table not handled above.
for (PartitionDesc pd : conf.getPathToPartitionInfo().values()) {
  if (!tableNameToConf.containsKey(pd.getTableName())) {
    tableNameToConf.put(pd.getTableName(), hconf);
  }
}
for (PartitionDesc pd : conf.getAliasToPartnInfo().values()) {
  if (!tableNameToConf.containsKey(pd.getTableName())) {
    tableNameToConf.put(pd.getTableName(), hconf);
  }
}
eventDesc.setVertexName(work.getName());
eventDesc.setInputName(work.getAliases().get(0));

if (!work.getEventSourceTableDescMap().containsKey(sourceName)) {
  work.getEventSourceTableDescMap().put(sourceName, new LinkedList<TableDesc>());
}
List<TableDesc> tables = work.getEventSourceTableDescMap().get(sourceName);
tables.add(event.getConf().getTable());

if (!work.getEventSourceColumnNameMap().containsKey(sourceName)) {
  work.getEventSourceColumnNameMap().put(sourceName, new LinkedList<String>());
}
List<String> columns = work.getEventSourceColumnNameMap().get(sourceName);
columns.add(eventDesc.getTargetColumnName());

if (!work.getEventSourceColumnTypeMap().containsKey(sourceName)) {
  work.getEventSourceColumnTypeMap().put(sourceName, new LinkedList<String>());
}
List<String> columnTypes = work.getEventSourceColumnTypeMap().get(sourceName);
columnTypes.add(eventDesc.getTargetColumnType());

if (!work.getEventSourcePartKeyExprMap().containsKey(sourceName)) {
  work.getEventSourcePartKeyExprMap().put(sourceName, new LinkedList<ExprNodeDesc>());
}
List<ExprNodeDesc> keys = work.getEventSourcePartKeyExprMap().get(sourceName);
keys.add(eventDesc.getPartKey());
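// Aside (a sketch, assuming the getters return java.util.Map): the
// containsKey-then-put pattern above can be collapsed with Map.computeIfAbsent
// on Java 8+, e.g. for the table-desc map:
work.getEventSourceTableDescMap()
    .computeIfAbsent(sourceName, k -> new LinkedList<TableDesc>())
    .add(event.getConf().getTable());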
private void genSMBJoinWork(MapWork currWork, SMBMapJoinOperator smbJoinOp) {
  // Remove the paths whose aliases are no longer part of aliasToPartitionInfo.
  Map<String, PartitionDesc> aliasToPartitionInfo = currWork.getAliasToPartnInfo();
  List<Path> removePaths = new ArrayList<>();
  for (Map.Entry<Path, ArrayList<String>> entry : currWork.getPathToAliases().entrySet()) {
    boolean keepPath = false;
    for (String alias : entry.getValue()) {
      if (aliasToPartitionInfo.containsKey(alias)) {
        keepPath = true;
        break;
      }
    }
    if (!keepPath) {
      removePaths.add(entry.getKey());
    }
  }

  List<String> removeAliases = new ArrayList<>();
  for (Path removePath : removePaths) {
    removeAliases.addAll(currWork.getPathToAliases().get(removePath));
    currWork.removePathToAlias(removePath);
    currWork.removePathToPartitionInfo(removePath);
  }
  for (String alias : removeAliases) {
    currWork.getAliasToPartnInfo().remove(alias);
    currWork.getAliasToWork().remove(alias);
  }

  // Move the small-table aliases from the local work into the map work.
  MapredLocalWork localWork = smbJoinOp.getConf().getLocalWork();
  for (Map.Entry<String, Operator<? extends OperatorDesc>> entry
      : localWork.getAliasToWork().entrySet()) {
    String alias = entry.getKey();
    Operator<? extends OperatorDesc> op = entry.getValue();
    FetchWork fetchWork = localWork.getAliasToFetchWork().get(alias);
    currWork.getAliasToWork().put(alias, op);

    PartitionDesc partitionInfo = currWork.getAliasToPartnInfo().get(alias);
    if (fetchWork.getTblDir() != null) {
      currWork.mergeAliasedInput(alias, fetchWork.getTblDir(), partitionInfo);
    } else {
      for (Path pathDir : fetchWork.getPartDir()) {
        currWork.mergeAliasedInput(alias, pathDir, partitionInfo);
      }
    }
  }
}
@Override
public Path call() throws Exception {
  if (!this.skipDummy && isEmptyPath(this.job, this.path, this.ctx)) {
    return createDummyFileForEmptyPartition(this.path, this.job,
        this.work.getPathToPartitionInfo().get(this.path), this.hiveScratchDir);
  }
  return this.path;
  }
}
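// Usage sketch (assumed caller, not the original source): callables like the
// one above can be fanned out over the input paths with an ExecutorService.
// The class name GetInputPathsCallable, its constructor order, and numThreads
// are all assumptions for illustration.
ExecutorService pool = Executors.newFixedThreadPool(numThreads);
List<Future<Path>> results = new ArrayList<>();
for (Path path : work.getPaths()) {
  results.add(pool.submit(
      new GetInputPathsCallable(path, job, work, hiveScratchDir, ctx, skipDummy)));
}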
private String getAliasForTableScanOperator(MapWork work, TableScanOperator tso) {
  for (Map.Entry<String, Operator<? extends OperatorDesc>> entry
      : work.getAliasToWork().entrySet()) {
    if (entry.getValue() == tso) {
      return entry.getKey();
    }
  }
  return null;
}
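// Usage sketch (assumed caller). Note the lookup above deliberately compares
// operator references with ==, since aliasToWork holds the same Operator
// instances that appear in the plan graph. mapWork and tableScanOp are
// hypothetical variables.
String alias = getAliasForTableScanOperator(mapWork, tableScanOp);
if (alias != null) {
  PartitionDesc partDesc = mapWork.getAliasToPartnInfo().get(alias);
  // ... (use the partition info for the scanned table)
}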
Map<TableDesc, StructObjectInspector> convertedOI = getConvertedOI(tableNameToConf);

for (Map.Entry<Path, ArrayList<String>> entry : conf.getPathToAliases().entrySet()) {
  Path onefile = entry.getKey();
  List<String> aliases = entry.getValue();
  PartitionDesc partDesc = conf.getPathToPartitionInfo().get(onefile);
  TableDesc tableDesc = partDesc.getTableDesc();
  Configuration newConf = tableNameToConf.get(tableDesc.getTableName());

  for (String alias : aliases) {
    Operator<? extends OperatorDesc> op = conf.getAliasToWork().get(alias);
    if (LOG.isDebugEnabled()) {
      LOG.debug("Adding alias " + alias + " to work list for file " + onefile);
    }
    // ... (initialize op with newConf and the converted object inspector)
  }
}
@SuppressWarnings("rawtypes") private static Path createDummyFileForEmptyTable(JobConf job, MapWork work, Path hiveScratchDir, String alias) throws Exception { TableDesc tableDesc = work.getAliasToPartnInfo().get(alias).getTableDesc(); if (tableDesc.isNonNative()) { // if it does not need native storage, we can't create an empty file for it. return null; } Properties props = tableDesc.getProperties(); HiveOutputFormat outFileFormat = HiveFileFormatUtils.getHiveOutputFormat(job, tableDesc); Path newPath = createEmptyFile(hiveScratchDir, outFileFormat, job, props, false); LOG.info("Changed input file for alias {} to newPath", alias, newPath); // update the work LinkedHashMap<Path, ArrayList<String>> pathToAliases = work.getPathToAliases(); ArrayList<String> newList = new ArrayList<String>(1); newList.add(alias); pathToAliases.put(newPath, newList); work.setPathToAliases(pathToAliases); PartitionDesc pDesc = work.getAliasToPartnInfo().get(alias).clone(); work.addPathToPartitionInfo(newPath, pDesc); return newPath; }
Iterator<String> it = work.getPathToPartitionInfo().keySet().iterator();
while (it.hasNext()) {
  String p = it.next();
  PartitionDesc desc = work.getPathToPartitionInfo().get(p);
  Map<String, String> spec = desc.getPartSpec();
  if (spec == null) {
    LOG.info("Pruning path: " + p);
    it.remove();
    work.getPathToAliases().remove(p);
    work.getPaths().remove(p);
    work.getPartitionDescs().remove(desc);
  }
}
Path tmpPath = targetWork.getTmpPathForPartitionPruning();
if (tmpPath == null) {
  Path baseTmpPath = context.parseContext.getContext().getMRTmpPath();
  tmpPath = SparkUtilities.generateTmpPathForPartitionPruning(baseTmpPath, targetId);
  targetWork.setTmpPathForPartitionPruning(tmpPath);
  LOG.info("Setting tmp path between source work and target work:\n" + tmpPath);
}

desc.setTargetWork(targetWork.getName());

if (!targetWork.getEventSourceTableDescMap().containsKey(sourceId)) {
  targetWork.getEventSourceTableDescMap().put(sourceId, new LinkedList<TableDesc>());
}
List<TableDesc> tables = targetWork.getEventSourceTableDescMap().get(sourceId);
tables.add(pruningSink.getConf().getTable());

if (!targetWork.getEventSourceColumnNameMap().containsKey(sourceId)) {
  targetWork.getEventSourceColumnNameMap().put(sourceId, new LinkedList<String>());
}
List<String> columns = targetWork.getEventSourceColumnNameMap().get(sourceId);
columns.add(desc.getTargetColumnName());

if (!targetWork.getEventSourcePartKeyExprMap().containsKey(sourceId)) {
  targetWork.getEventSourcePartKeyExprMap().put(sourceId, new LinkedList<ExprNodeDesc>());
}
List<ExprNodeDesc> keys = targetWork.getEventSourcePartKeyExprMap().get(sourceId);
keys.add(desc.getPartKey());
MapWork mapWork = new MapWork();
Context context = new Context(jobConf);

// pathToAliasTable was referenced but not declared in the original excerpt;
// it is declared here and assumed to be populated by the test fixture.
LinkedHashMap<Path, ArrayList<String>> pathToAliasTable = new LinkedHashMap<>();
LinkedHashMap<Path, PartitionDesc> pathToPartitionInfo = new LinkedHashMap<>();

mapWork.getAliasToWork().put(testPartitionName, scanOp);
mapWork.setPathToAliases(pathToAliasTable);
mapWork.setPathToPartitionInfo(pathToPartitionInfo);
Path taskTmpDirPath = new Path(taskTmpDir);
MapWork mWork = plan.getMapWork();
if (!mWork.getPathToAliases().containsKey(taskTmpDirPath)) {
  taskTmpDir = taskTmpDir.intern();
  StringInternUtils.internUriStringsInPath(taskTmpDirPath);
  TableDesc tt_desc = tt_descLst.get(pos);
  mWork.addPathToAlias(taskTmpDirPath, taskTmpDir);
  mWork.addPathToPartitionInfo(taskTmpDirPath, new PartitionDesc(tt_desc, null));
  mWork.getAliasToWork().put(taskTmpDir, topOperators.get(pos));
}
/**
 * Hive uses tmp directories to capture the output of each FileSinkOperator.
 * This method creates all necessary tmp directories for FileSinks in the MapWork.
 *
 * @param conf Used to get the right FileSystem
 * @param mWork Used to find FileSinkOperators
 * @throws IOException
 */
public static void createTmpDirs(Configuration conf, MapWork mWork) throws IOException {
  Map<Path, ArrayList<String>> pa = mWork.getPathToAliases();
  if (MapUtils.isNotEmpty(pa)) {
    // common case: 1 table scan per map-work
    // rare case: smb joins
    HashSet<String> aliases = new HashSet<String>(1);
    List<Operator<? extends OperatorDesc>> ops =
        new ArrayList<Operator<? extends OperatorDesc>>();
    for (List<String> ls : pa.values()) {
      for (String a : ls) {
        aliases.add(a);
      }
    }
    for (String a : aliases) {
      ops.add(mWork.getAliasToWork().get(a));
    }
    createTmpDirs(conf, ops);
  }
}
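// Usage sketch (assumed caller): typically invoked during job setup, after the
// MapWork has been fully populated and before tasks launch. hiveConf is a
// hypothetical variable; Utilities.getMapWork is Hive's standard accessor for
// the serialized plan, assumed available in this context.
JobConf job = new JobConf(hiveConf);
MapWork mWork = Utilities.getMapWork(job);
createTmpDirs(job, mWork);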