Map<Path, ArrayList<String>> sourcePathToAliases = source.getPathToAliases();
Map<Path, PartitionDesc> sourcePathToPartitionInfo = source.getPathToPartitionInfo();
Map<String, Operator<? extends OperatorDesc>> sourceAliasToWork = source.getAliasToWork();
Map<String, PartitionDesc> sourceAliasToPartnInfo = source.getAliasToPartnInfo();

LinkedHashMap<Path, ArrayList<String>> targetPathToAliases = target.getPathToAliases();
LinkedHashMap<Path, PartitionDesc> targetPathToPartitionInfo = target.getPathToPartitionInfo();
Map<String, Operator<? extends OperatorDesc>> targetAliasToWork = target.getAliasToWork();
Map<String, PartitionDesc> targetAliasToPartnInfo = target.getAliasToPartnInfo();

// Write the (possibly updated) maps back through the setters, which intern
// duplicate path strings (see the Kryo read() override below).
target.setPathToAliases(targetPathToAliases);
target.setPathToPartitionInfo(targetPathToPartitionInfo);
Iterator<Path> it = work.getPathToPartitionInfo().keySet().iterator();
while (it.hasNext()) {
  Path p = it.next();
  PartitionDesc desc = work.getPathToPartitionInfo().get(p);
  Map<String, String> spec = desc.getPartSpec();
  if (spec == null) {
    LOG.info("Pruning path: " + p);
    it.remove();
    work.removePathToAlias(p);
    work.getPartitionDescs().remove(desc);
  }
}
@Test
public void testGetAndSetConsistency() {
  MapWork mw = new MapWork();
  LinkedHashMap<Path, ArrayList<String>> pathToAliases = new LinkedHashMap<>();
  pathToAliases.put(new Path("p0"), Lists.newArrayList("a1", "a2"));
  mw.setPathToAliases(pathToAliases);

  LinkedHashMap<Path, ArrayList<String>> pta = mw.getPathToAliases();
  assertEquals(pathToAliases, pta);
}
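// A hedged companion check (a sketch, not from the original source): the
// incremental mutators used elsewhere in this section, addPathToAlias and
// removePathToAlias, should be visible through the same map that
// setPathToAliases installed.
@Test
public void testAddAndRemovePathToAlias() {
  MapWork mw = new MapWork();
  mw.setPathToAliases(new LinkedHashMap<Path, ArrayList<String>>());

  Path p = new Path("p1");
  mw.addPathToAlias(p, "a1");
  assertTrue(mw.getPathToAliases().containsKey(p));

  mw.removePathToAlias(p);
  assertFalse(mw.getPathToAliases().containsKey(p));
}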
@Override
public MapWork read(Kryo kryo, Input input, Class<MapWork> type) {
  MapWork mapWork = super.read(kryo, input, type);
  // The set methods in MapWork intern any duplicate strings, which is why
  // we call them during de-serialization.
  mapWork.setPathToPartitionInfo(mapWork.getPathToPartitionInfo());
  mapWork.setPathToAliases(mapWork.getPathToAliases());
  return mapWork;
  }
}
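// Sketch only: how a serializer like the one above could be wired into a Kryo
// instance. The class name MapWorkSerializer is an assumption (the excerpt does
// not show it), and the serializer is assumed to extend Kryo's FieldSerializer
// (hence the super.read call), whose constructor takes the Kryo instance and
// the target class.
Kryo kryo = new Kryo();
kryo.register(MapWork.class, new MapWorkSerializer(kryo, MapWork.class));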
private static void updatePathForMapWork(Path newPath, MapWork work, Path path) {
  // update the work
  if (!newPath.equals(path)) {
    PartitionDesc partDesc = work.getPathToPartitionInfo().get(path);
    work.addPathToAlias(newPath, work.getPathToAliases().get(path));
    work.removePathToAlias(path);
    work.removePathToPartitionInfo(path);
    work.addPathToPartitionInfo(newPath, partDesc);
  }
}
/**
 * Create a new plan and return it. The plan won't contain the name-to-split
 * sample information from the parse context.
 *
 * @return the new plan
 */
public static MapredWork getMapRedWorkFromConf(HiveConf conf) {
  MapredWork mrWork = new MapredWork();
  MapWork work = mrWork.getMapWork();

  boolean mapperCannotSpanPartns = conf.getBoolVar(
      HiveConf.ConfVars.HIVE_MAPPER_CANNOT_SPAN_MULTIPLE_PARTITIONS);
  work.setMapperCannotSpanPartns(mapperCannotSpanPartns);
  work.setPathToAliases(new LinkedHashMap<Path, ArrayList<String>>());
  work.setPathToPartitionInfo(new LinkedHashMap<Path, PartitionDesc>());
  work.setAliasToWork(new LinkedHashMap<String, Operator<? extends OperatorDesc>>());
  return mrWork;
}
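// Usage sketch (assumed caller code, not from the original source): the empty
// skeleton returned above is typically populated through the add* methods and
// aliasToWork map seen in the other snippets here. inputPath, partDesc and
// tableScanOp are hypothetical placeholders.
MapredWork mrWork = getMapRedWorkFromConf(conf);
MapWork work = mrWork.getMapWork();
work.addPathToAlias(inputPath, "t1");             // inputPath: assumed Path
work.addPathToPartitionInfo(inputPath, partDesc); // partDesc: assumed PartitionDesc
work.getAliasToWork().put("t1", tableScanOp);     // tableScanOp: assumed operator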
private void handleSampling(Context context, MapWork mWork, JobConf job)
    throws Exception {
  assert mWork.getAliasToWork().keySet().size() == 1;

  String alias = mWork.getAliases().get(0);
  Operator<?> topOp = mWork.getAliasToWork().get(alias);
  PartitionDesc partDesc = mWork.getAliasToPartnInfo().get(alias);

  ArrayList<PartitionDesc> parts = mWork.getPartitionDescs();
  List<Path> inputPaths = mWork.getPaths();

  PartitionKeySampler sampler = new PartitionKeySampler();

  if (mWork.getSamplingType() == MapWork.SAMPLING_ON_PREV_MR) {
    console.printInfo("Use sampling data created in previous MR");
    // ... (reuse sample data written by the previous MR job)
  } else if (mWork.getSamplingType() == MapWork.SAMPLING_ON_START) {
    console.printInfo("Creating sampling data..");
    assert topOp instanceof TableScanOperator;
    // ... (sample rows directly from the input paths)
  } else {
    throw new IllegalArgumentException("Invalid sampling type " + mWork.getSamplingType());
  }
  // ... (write the sampled partition keys and configure the job)
}
private boolean targetsOfSameDPPSink(MapWork first, MapWork second) {
  Set<String> sources1 = first.getEventSourceColumnNameMap().keySet();
  Set<String> sources2 = second.getEventSourceColumnNameMap().keySet();
  if (!sources1.equals(sources2)) {
    return false;
  }
  for (String source : sources1) {
    // Column names must match once the target id suffix is stripped off.
    Set<String> names1 = first.getEventSourceColumnNameMap().get(source).stream().map(
        SparkPartitionPruningSinkDesc::stripOffTargetId).collect(Collectors.toSet());
    Set<String> names2 = second.getEventSourceColumnNameMap().get(source).stream().map(
        SparkPartitionPruningSinkDesc::stripOffTargetId).collect(Collectors.toSet());
    if (!names1.equals(names2)) {
      return false;
    }
    Set<String> types1 = new HashSet<>(first.getEventSourceColumnTypeMap().get(source));
    Set<String> types2 = new HashSet<>(second.getEventSourceColumnTypeMap().get(source));
    if (!types1.equals(types2)) {
      return false;
    }
    Set<TableDesc> tableDescs1 = new HashSet<>(first.getEventSourceTableDescMap().get(source));
    Set<TableDesc> tableDescs2 = new HashSet<>(second.getEventSourceTableDescMap().get(source));
    if (!tableDescs1.equals(tableDescs2)) {
      return false;
    }
    List<ExprNodeDesc> descs1 = first.getEventSourcePartKeyExprMap().get(source);
    List<ExprNodeDesc> descs2 = second.getEventSourcePartKeyExprMap().get(source);
    if (descs1.size() != descs2.size()) {
      return false;
    }
    // ... (compare the partition key expressions pairwise)
  }
  return true;
}
Map<String, Configuration> tableNameToConf = new HashMap<>();

for (Map.Entry<Path, ArrayList<String>> e : conf.getPathToAliases().entrySet()) {
  List<String> aliases = e.getValue();
  if (aliases == null || aliases.isEmpty()) {
    continue;
  }
  String tableName = conf.getPathToPartitionInfo().get(e.getKey()).getTableName();
  if (tableNameToConf.containsKey(tableName)) {
    continue;
  }
  for (String alias : aliases) {
    Operator<?> rootOp = conf.getAliasToWork().get(alias);
    if (!(rootOp instanceof TableScanOperator)) {
      continue;
    }
    // ... (clone hconf for this table scan, e.g. for nested column pruning)
  }
}

// Fall back to the unmodified conf for any table not handled above.
for (PartitionDesc pd : conf.getPathToPartitionInfo().values()) {
  if (!tableNameToConf.containsKey(pd.getTableName())) {
    tableNameToConf.put(pd.getTableName(), hconf);
  }
}
for (PartitionDesc pd : conf.getAliasToPartnInfo().values()) {
  if (!tableNameToConf.containsKey(pd.getTableName())) {
    tableNameToConf.put(pd.getTableName(), hconf);
  }
}
eventDesc.setVertexName(work.getName());
eventDesc.setInputName(work.getAliases().get(0));

if (!work.getEventSourceTableDescMap().containsKey(sourceName)) {
  work.getEventSourceTableDescMap().put(sourceName, new LinkedList<TableDesc>());
}
List<TableDesc> tables = work.getEventSourceTableDescMap().get(sourceName);
tables.add(event.getConf().getTable());

if (!work.getEventSourceColumnNameMap().containsKey(sourceName)) {
  work.getEventSourceColumnNameMap().put(sourceName, new LinkedList<String>());
}
List<String> columns = work.getEventSourceColumnNameMap().get(sourceName);
columns.add(eventDesc.getTargetColumnName());

if (!work.getEventSourceColumnTypeMap().containsKey(sourceName)) {
  work.getEventSourceColumnTypeMap().put(sourceName, new LinkedList<String>());
}
List<String> columnTypes = work.getEventSourceColumnTypeMap().get(sourceName);
columnTypes.add(eventDesc.getTargetColumnType());

if (!work.getEventSourcePartKeyExprMap().containsKey(sourceName)) {
  work.getEventSourcePartKeyExprMap().put(sourceName, new LinkedList<ExprNodeDesc>());
}
List<ExprNodeDesc> keys = work.getEventSourcePartKeyExprMap().get(sourceName);
keys.add(eventDesc.getPartKey());
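// Aside (a sketch, assuming the getters return java.util.Map): the
// containsKey-then-put pattern above can be collapsed with Map.computeIfAbsent
// on Java 8+, e.g. for the table-desc map:
work.getEventSourceTableDescMap()
    .computeIfAbsent(sourceName, k -> new LinkedList<TableDesc>())
    .add(event.getConf().getTable());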
private void genSMBJoinWork(MapWork currWork, SMBMapJoinOperator smbJoinOp) {
  // Remove the paths whose aliases are no longer part of aliasToPartitionInfo.
  Map<String, PartitionDesc> aliasToPartitionInfo = currWork.getAliasToPartnInfo();
  List<Path> removePaths = new ArrayList<>();
  for (Map.Entry<Path, ArrayList<String>> entry : currWork.getPathToAliases().entrySet()) {
    boolean keepPath = false;
    for (String alias : entry.getValue()) {
      if (aliasToPartitionInfo.containsKey(alias)) {
        keepPath = true;
        break;
      }
    }
    if (!keepPath) {
      removePaths.add(entry.getKey());
    }
  }

  List<String> removeAliases = new ArrayList<>();
  for (Path removePath : removePaths) {
    removeAliases.addAll(currWork.getPathToAliases().get(removePath));
    currWork.removePathToAlias(removePath);
    currWork.removePathToPartitionInfo(removePath);
  }
  for (String alias : removeAliases) {
    currWork.getAliasToPartnInfo().remove(alias);
    currWork.getAliasToWork().remove(alias);
  }

  // Move the small-table aliases from the local work into the map work.
  MapredLocalWork localWork = smbJoinOp.getConf().getLocalWork();
  for (Map.Entry<String, Operator<? extends OperatorDesc>> entry
      : localWork.getAliasToWork().entrySet()) {
    String alias = entry.getKey();
    Operator<? extends OperatorDesc> op = entry.getValue();
    FetchWork fetchWork = localWork.getAliasToFetchWork().get(alias);
    currWork.getAliasToWork().put(alias, op);

    PartitionDesc partitionInfo = currWork.getAliasToPartnInfo().get(alias);
    if (fetchWork.getTblDir() != null) {
      currWork.mergeAliasedInput(alias, fetchWork.getTblDir(), partitionInfo);
    } else {
      for (Path pathDir : fetchWork.getPartDir()) {
        currWork.mergeAliasedInput(alias, pathDir, partitionInfo);
      }
    }
  }
}
@Override
public Path call() throws Exception {
  if (!this.skipDummy && isEmptyPath(this.job, this.path, this.ctx)) {
    return createDummyFileForEmptyPartition(this.path, this.job,
        this.work.getPathToPartitionInfo().get(this.path), this.hiveScratchDir);
  }
  return this.path;
  }
}
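// Usage sketch (assumed caller, not the original source): callables like the
// one above can be fanned out over the input paths with an ExecutorService.
// The class name GetInputPathsCallable, its constructor order, and numThreads
// are all assumptions for illustration.
ExecutorService pool = Executors.newFixedThreadPool(numThreads);
List<Future<Path>> results = new ArrayList<>();
for (Path path : work.getPaths()) {
  results.add(pool.submit(
      new GetInputPathsCallable(path, job, work, hiveScratchDir, ctx, skipDummy)));
}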
private String getAliasForTableScanOperator(MapWork work, TableScanOperator tso) {
  for (Map.Entry<String, Operator<? extends OperatorDesc>> entry
      : work.getAliasToWork().entrySet()) {
    if (entry.getValue() == tso) {
      return entry.getKey();
    }
  }
  return null;
}
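// Usage sketch (assumed caller). Note the lookup above deliberately compares
// operator references with ==, since aliasToWork holds the same Operator
// instances that appear in the plan graph. mapWork and tableScanOp are
// hypothetical variables.
String alias = getAliasForTableScanOperator(mapWork, tableScanOp);
if (alias != null) {
  PartitionDesc partDesc = mapWork.getAliasToPartnInfo().get(alias);
  // ... (use the partition info for the scanned table)
}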
Map<TableDesc, StructObjectInspector> convertedOI = getConvertedOI(tableNameToConf);

for (Map.Entry<Path, ArrayList<String>> entry : conf.getPathToAliases().entrySet()) {
  Path onefile = entry.getKey();
  List<String> aliases = entry.getValue();
  PartitionDesc partDesc = conf.getPathToPartitionInfo().get(onefile);
  TableDesc tableDesc = partDesc.getTableDesc();
  Configuration newConf = tableNameToConf.get(tableDesc.getTableName());

  for (String alias : aliases) {
    Operator<? extends OperatorDesc> op = conf.getAliasToWork().get(alias);
    if (LOG.isDebugEnabled()) {
      LOG.debug("Adding alias " + alias + " to work list for file " + onefile);
    }
    // ... (initialize op with newConf and the converted object inspector)
  }
}
@SuppressWarnings("rawtypes") private static Path createDummyFileForEmptyTable(JobConf job, MapWork work, Path hiveScratchDir, String alias) throws Exception { TableDesc tableDesc = work.getAliasToPartnInfo().get(alias).getTableDesc(); if (tableDesc.isNonNative()) { // if it does not need native storage, we can't create an empty file for it. return null; } Properties props = tableDesc.getProperties(); HiveOutputFormat outFileFormat = HiveFileFormatUtils.getHiveOutputFormat(job, tableDesc); Path newPath = createEmptyFile(hiveScratchDir, outFileFormat, job, props, false); LOG.info("Changed input file for alias {} to newPath", alias, newPath); // update the work LinkedHashMap<Path, ArrayList<String>> pathToAliases = work.getPathToAliases(); ArrayList<String> newList = new ArrayList<String>(1); newList.add(alias); pathToAliases.put(newPath, newList); work.setPathToAliases(pathToAliases); PartitionDesc pDesc = work.getAliasToPartnInfo().get(alias).clone(); work.addPathToPartitionInfo(newPath, pDesc); return newPath; }
Iterator<String> it = work.getPathToPartitionInfo().keySet().iterator();
while (it.hasNext()) {
  String p = it.next();
  PartitionDesc desc = work.getPathToPartitionInfo().get(p);
  Map<String, String> spec = desc.getPartSpec();
  if (spec == null) {
    LOG.info("Pruning path: " + p);
    it.remove();
    work.getPathToAliases().remove(p);
    work.getPaths().remove(p);
    work.getPartitionDescs().remove(desc);
  }
}
Path tmpPath = targetWork.getTmpPathForPartitionPruning();
if (tmpPath == null) {
  Path baseTmpPath = context.parseContext.getContext().getMRTmpPath();
  tmpPath = SparkUtilities.generateTmpPathForPartitionPruning(baseTmpPath, targetId);
  targetWork.setTmpPathForPartitionPruning(tmpPath);
  LOG.info("Setting tmp path between source work and target work:\n" + tmpPath);
}

desc.setTargetWork(targetWork.getName());

if (!targetWork.getEventSourceTableDescMap().containsKey(sourceId)) {
  targetWork.getEventSourceTableDescMap().put(sourceId, new LinkedList<TableDesc>());
}
List<TableDesc> tables = targetWork.getEventSourceTableDescMap().get(sourceId);
tables.add(pruningSink.getConf().getTable());

if (!targetWork.getEventSourceColumnNameMap().containsKey(sourceId)) {
  targetWork.getEventSourceColumnNameMap().put(sourceId, new LinkedList<String>());
}
List<String> columns = targetWork.getEventSourceColumnNameMap().get(sourceId);
columns.add(desc.getTargetColumnName());

if (!targetWork.getEventSourcePartKeyExprMap().containsKey(sourceId)) {
  targetWork.getEventSourcePartKeyExprMap().put(sourceId, new LinkedList<ExprNodeDesc>());
}
List<ExprNodeDesc> keys = targetWork.getEventSourcePartKeyExprMap().get(sourceId);
keys.add(desc.getPartKey());
MapWork mapWork = new MapWork();
Context context = new Context(jobConf);

// pathToAliasTable was referenced but not declared in the original excerpt;
// it is declared here and assumed to be populated by the test fixture.
LinkedHashMap<Path, ArrayList<String>> pathToAliasTable = new LinkedHashMap<>();
LinkedHashMap<Path, PartitionDesc> pathToPartitionInfo = new LinkedHashMap<>();

mapWork.getAliasToWork().put(testPartitionName, scanOp);
mapWork.setPathToAliases(pathToAliasTable);
mapWork.setPathToPartitionInfo(pathToPartitionInfo);
Path taskTmpDirPath = new Path(taskTmpDir);
MapWork mWork = plan.getMapWork();
if (!mWork.getPathToAliases().containsKey(taskTmpDirPath)) {
  taskTmpDir = taskTmpDir.intern();
  StringInternUtils.internUriStringsInPath(taskTmpDirPath);
  TableDesc tt_desc = tt_descLst.get(pos);
  mWork.addPathToAlias(taskTmpDirPath, taskTmpDir);
  mWork.addPathToPartitionInfo(taskTmpDirPath, new PartitionDesc(tt_desc, null));
  mWork.getAliasToWork().put(taskTmpDir, topOperators.get(pos));
}
/**
 * Hive uses tmp directories to capture the output of each FileSinkOperator.
 * This method creates all necessary tmp directories for FileSinks in the MapWork.
 *
 * @param conf Used to get the right FileSystem
 * @param mWork Used to find FileSinkOperators
 * @throws IOException
 */
public static void createTmpDirs(Configuration conf, MapWork mWork) throws IOException {
  Map<Path, ArrayList<String>> pa = mWork.getPathToAliases();
  if (MapUtils.isNotEmpty(pa)) {
    // common case: 1 table scan per map-work
    // rare case: smb joins
    HashSet<String> aliases = new HashSet<String>(1);
    List<Operator<? extends OperatorDesc>> ops =
        new ArrayList<Operator<? extends OperatorDesc>>();
    for (List<String> ls : pa.values()) {
      for (String a : ls) {
        aliases.add(a);
      }
    }
    for (String a : aliases) {
      ops.add(mWork.getAliasToWork().get(a));
    }
    createTmpDirs(conf, ops);
  }
}
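// Usage sketch (assumed caller): typically invoked during job setup, after the
// MapWork has been fully populated and before tasks launch. hiveConf is a
// hypothetical variable; Utilities.getMapWork is Hive's standard accessor for
// the serialized plan, assumed available in this context.
JobConf job = new JobConf(hiveConf);
MapWork mWork = Utilities.getMapWork(job);
createTmpDirs(job, mWork);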