public static PartitionDesc getPartitionDescFromPathRecursively( Map<Path, PartitionDesc> pathToPartitionInfo, Path dir, Map<Map<Path, PartitionDesc>, Map<Path, PartitionDesc>> cacheMap, boolean ignoreSchema) throws IOException { PartitionDesc part = doGetPartitionDescFromPath(pathToPartitionInfo, dir); if (part == null && (ignoreSchema || (dir.toUri().getScheme() == null || dir.toUri().getScheme().trim().equals("")) || FileUtils.pathsContainNoScheme(pathToPartitionInfo.keySet()))) { Map<Path, PartitionDesc> newPathToPartitionInfo = null; if (cacheMap != null) { newPathToPartitionInfo = cacheMap.get(pathToPartitionInfo); } if (newPathToPartitionInfo == null) { // still null newPathToPartitionInfo = populateNewPartitionDesc(pathToPartitionInfo); if (cacheMap != null) { cacheMap.put(pathToPartitionInfo, newPathToPartitionInfo); } } part = doGetPartitionDescFromPath(newPathToPartitionInfo, dir); } if (part != null) { return part; } else { throw new IOException("cannot find dir = " + dir.toString() + " in pathToPartitionInfo: " + pathToPartitionInfo.keySet()); } }
private PartitionDesc extractSinglePartSpec(CombineHiveInputSplit hsplit) throws IOException { PartitionDesc part = null; Map<Map<Path,PartitionDesc>, Map<Path,PartitionDesc>> cache = new HashMap<>(); for (Path path : hsplit.getPaths()) { PartitionDesc otherPart = HiveFileFormatUtils.getPartitionDescFromPathRecursively( pathToPartInfo, path, cache); LOG.debug("Found spec for " + path + " " + otherPart + " from " + pathToPartInfo); if (part == null) { part = otherPart; } else if (otherPart != part) { // Assume we should have the exact same object. // TODO: we could also compare the schema and SerDe, and pass only those to the call // instead; most of the time these would be the same and LLAP IO can handle that. LOG.warn("Multiple partitions found; not going to pass a part spec to LLAP IO: {" + part.getPartSpec() + "} and {" + otherPart.getPartSpec() + "}"); return null; } } return part; }
/**
 * Decides whether split {@code s} must start a new group relative to {@code prevSplit}.
 *
 * <p>With no previous split the answer is always {@code true}. When grouping across files is
 * disabled, the answer depends only on whether the two splits have different paths. Otherwise
 * the input format class and deserializer class name of the two partitions are compared.
 *
 * @param s the split being added; cast to {@code FileSplit}
 * @param prevSplit the previously added split, or {@code null} if there is none
 * @param groupAcrossFiles whether splits from different files may share a group
 * @param work supplies the path-to-partition map used for the lookups
 * @return {@code true} if {@code s} should open a new group
 * @throws IOException if a partition descriptor cannot be resolved
 */
private boolean schemaEvolved(InputSplit s, InputSplit prevSplit, boolean groupAcrossFiles,
    MapWork work) throws IOException {
  Path path = ((FileSplit) s).getPath();
  PartitionDesc currentDesc = HiveFileFormatUtils.getPartitionDescFromPathRecursively(
      work.getPathToPartitionInfo(), path, cache);
  String currentSerde = currentDesc.getDeserializerClassName();
  Class<?> currentFormat = currentDesc.getInputFileFormatClass();

  // Both stay null when there is no previous split, which forces a new group below.
  String previousSerde = null;
  Class<?> previousFormat = null;

  if (prevSplit != null) {
    Path prevPath = ((FileSplit) prevSplit).getPath();
    if (!groupAcrossFiles) {
      return !path.equals(prevPath);
    }
    PartitionDesc previousDesc = HiveFileFormatUtils.getPartitionDescFromPathRecursively(
        work.getPathToPartitionInfo(), prevPath, cache);
    previousSerde = previousDesc.getDeserializerClassName();
    previousFormat = previousDesc.getInputFileFormatClass();
  }

  boolean retval =
      (currentFormat != previousFormat) || (!currentSerde.equals(previousSerde));

  if (LOG.isDebugEnabled()) {
    LOG.debug("Adding split " + path + " to src new group? " + retval);
  }
  return retval;
}
public InputSplit[] doGetSplits(JobConf job, int numSplits) throws IOException { super.init(job); Path[] dirs = FileInputFormat.getInputPaths(job); if (dirs.length == 0) { throw new IOException("No input paths specified in job"); } JobConf newjob = new JobConf(job); ArrayList<InputSplit> result = new ArrayList<InputSplit>(); // for each dir, get the InputFormat, and do getSplits. PartitionDesc part; for (Path dir : dirs) { part = HiveFileFormatUtils .getPartitionDescFromPathRecursively(pathToPartitionInfo, dir, IOPrepareCache.get().allocatePartitionDescMap(), true); // create a new InputFormat instance if this is the first time to see this // class Class inputFormatClass = part.getInputFileFormatClass(); InputFormat inputFormat = getInputFormatFromCache(inputFormatClass, job); Utilities.copyTableJobPropertiesToConf(part.getTableDesc(), newjob); FileInputFormat.setInputPaths(newjob, dir); newjob.setInputFormat(inputFormat.getClass()); InputSplit[] iss = inputFormat.getSplits(newjob, numSplits / dirs.length); for (InputSplit is : iss) { result.add(new HiveInputSplit(is, inputFormatClass.getName())); } } return result.toArray(new HiveInputSplit[result.size()]); }
/**
 * Resolves the partition descriptor for the file behind {@code split} and delegates to the
 * {@code getPartitionValues(vrbCtx, partDesc, partitionValues)} overload.
 *
 * @param vrbCtx context passed through to the overload
 * @param mapWork supplies the path-to-partition map
 * @param split file split whose path identifies the partition
 * @param partitionValues array passed through to the overload
 * @throws IOException if no partition descriptor is found for the split's path
 */
public static void getPartitionValues(VectorizedRowBatchCtx vrbCtx, MapWork mapWork,
    FileSplit split, Object[] partitionValues) throws IOException {
  Map<Path, PartitionDesc> partitionsByPath = mapWork.getPathToPartitionInfo();
  Path splitPath = split.getPath();
  PartitionDesc resolved = HiveFileFormatUtils.getPartitionDescFromPathRecursively(
      partitionsByPath, splitPath, IOPrepareCache.get().getPartitionDescMap());
  getPartitionValues(vrbCtx, resolved, partitionValues);
}
/**
 * Wraps a combined file split and, when a job is available, records the input format class
 * name of the partition that owns the split's first path.
 *
 * @param job job configuration; when {@code null} only the two fields are stored
 * @param inputSplitShim the underlying combined split
 * @param pathToPartitionInfo path-to-partition map; when {@code null} and {@code job} is
 *     non-null, it is read from {@code Utilities.getMapWork(job)}
 * @throws IOException if the first path cannot be resolved to a partition descriptor
 */
public CombineHiveInputSplit(JobConf job, CombineFileSplit inputSplitShim,
    Map<Path, PartitionDesc> pathToPartitionInfo) throws IOException {
  this.inputSplitShim = inputSplitShim;
  this.pathToPartitionInfo = pathToPartitionInfo;

  if (job == null) {
    return;
  }
  if (this.pathToPartitionInfo == null) {
    this.pathToPartitionInfo = Utilities.getMapWork(job).getPathToPartitionInfo();
  }

  // Only the first path is consulted to determine the input format class name.
  Path[] splitPaths = inputSplitShim.getPaths();
  if (splitPaths.length == 0) {
    return;
  }
  PartitionDesc firstPartition = HiveFileFormatUtils.getPartitionDescFromPathRecursively(
      this.pathToPartitionInfo, splitPaths[0], IOPrepareCache.get().getPartitionDescMap());
  inputFormatClassName = firstPartition.getInputFileFormatClass().getName();
}
/** * Writable interface. */ @Override public void write(DataOutput out) throws IOException { inputSplitShim.write(out); if (inputFormatClassName == null) { if (pathToPartitionInfo == null) { pathToPartitionInfo = Utilities.getMapWork(getJob()).getPathToPartitionInfo(); } // extract all the inputFormatClass names for each chunk in the // CombinedSplit. PartitionDesc part = HiveFileFormatUtils.getPartitionDescFromPathRecursively(pathToPartitionInfo, inputSplitShim.getPath(0), IOPrepareCache.get().getPartitionDescMap()); // create a new InputFormat instance if this is the first time to see // this class inputFormatClassName = part.getInputFileFormatClass().getName(); } out.writeUTF(inputFormatClassName); } }
PartitionDesc part = HiveFileFormatUtils.getPartitionDescFromPathRecursively( pathToPartitionInfo, path, IOPrepareCache.get().allocatePartitionDescMap()); TableDesc tableDesc = part.getTableDesc();
PartitionDesc part = HiveFileFormatUtils.getPartitionDescFromPathRecursively( pathToPartitionInfo, hsplit.getPath(), null); if (LOG.isDebugEnabled()) {
.getPartitionDescFromPathRecursively(pathToPartitionInfo, filePath, IOPrepareCache.get().getPartitionDescMap()); } catch (AssertionError ae) {
public static PartitionDesc getPartitionDescFromPathRecursively( Map<String, PartitionDesc> pathToPartitionInfo, Path dir, Map<Map<String, PartitionDesc>, Map<String, PartitionDesc>> cacheMap, boolean ignoreSchema) throws IOException { PartitionDesc part = doGetPartitionDescFromPath(pathToPartitionInfo, dir); if (part == null && (ignoreSchema || (dir.toUri().getScheme() == null || dir.toUri().getScheme().trim() .equals("")))) { Map<String, PartitionDesc> newPathToPartitionInfo = null; if (cacheMap != null) { newPathToPartitionInfo = cacheMap.get(pathToPartitionInfo); } if (newPathToPartitionInfo == null) { // still null newPathToPartitionInfo = new HashMap<String, PartitionDesc>(); populateNewPartitionDesc(pathToPartitionInfo, newPathToPartitionInfo); if (cacheMap != null) { cacheMap.put(pathToPartitionInfo, newPathToPartitionInfo); } } part = doGetPartitionDescFromPath(newPathToPartitionInfo, dir); } if (part != null) { return part; } else { throw new IOException("cannot find dir = " + dir.toString() + " in pathToPartitionInfo: " + pathToPartitionInfo.keySet()); } }
public static PartitionDesc getPartitionDescFromPathRecursively( Map<String, PartitionDesc> pathToPartitionInfo, Path dir, Map<Map<String, PartitionDesc>, Map<String, PartitionDesc>> cacheMap, boolean ignoreSchema) throws IOException { PartitionDesc part = doGetPartitionDescFromPath(pathToPartitionInfo, dir); if (part == null && (ignoreSchema || (dir.toUri().getScheme() == null || dir.toUri().getScheme().trim() .equals("")) || pathsContainNoScheme(pathToPartitionInfo) ) ) { Map<String, PartitionDesc> newPathToPartitionInfo = null; if (cacheMap != null) { newPathToPartitionInfo = cacheMap.get(pathToPartitionInfo); } if (newPathToPartitionInfo == null) { // still null newPathToPartitionInfo = new HashMap<String, PartitionDesc>(); populateNewPartitionDesc(pathToPartitionInfo, newPathToPartitionInfo); if (cacheMap != null) { cacheMap.put(pathToPartitionInfo, newPathToPartitionInfo); } } part = doGetPartitionDescFromPath(newPathToPartitionInfo, dir);
public InputSplit[] doGetSplits(JobConf job, int numSplits) throws IOException { super.init(job); Path[] dirs = FileInputFormat.getInputPaths(job); if (dirs.length == 0) { throw new IOException("No input paths specified in job"); } JobConf newjob = new JobConf(job); ArrayList<InputSplit> result = new ArrayList<InputSplit>(); // for each dir, get the InputFormat, and do getSplits. for (Path dir : dirs) { PartitionDesc part = HiveFileFormatUtils .getPartitionDescFromPathRecursively(pathToPartitionInfo, dir, IOPrepareCache.get().allocatePartitionDescMap(), true); // create a new InputFormat instance if this is the first time to see this // class Class inputFormatClass = part.getInputFileFormatClass(); InputFormat inputFormat = getInputFormatFromCache(inputFormatClass, job); Utilities.copyTableJobPropertiesToConf(part.getTableDesc(), newjob); FileInputFormat.setInputPaths(newjob, dir); newjob.setInputFormat(inputFormat.getClass()); InputSplit[] iss = inputFormat.getSplits(newjob, numSplits / dirs.length); for (InputSplit is : iss) { result.add(new HiveInputSplit(is, inputFormatClass.getName())); } } return result.toArray(new HiveInputSplit[result.size()]); }
public InputSplit[] doGetSplits(JobConf job, int numSplits) throws IOException { super.init(job); Path[] dirs = FileInputFormat.getInputPaths(job); if (dirs.length == 0) { throw new IOException("No input paths specified in job"); } JobConf newjob = new JobConf(job); ArrayList<InputSplit> result = new ArrayList<InputSplit>(); // for each dir, get the InputFormat, and do getSplits. PartitionDesc part; for (Path dir : dirs) { part = HiveFileFormatUtils .getPartitionDescFromPathRecursively(pathToPartitionInfo, dir, IOPrepareCache.get().allocatePartitionDescMap(), true); // create a new InputFormat instance if this is the first time to see this // class Class inputFormatClass = part.getInputFileFormatClass(); InputFormat inputFormat = getInputFormatFromCache(inputFormatClass, job); Utilities.copyTableJobPropertiesToConf(part.getTableDesc(), newjob); FileInputFormat.setInputPaths(newjob, dir); newjob.setInputFormat(inputFormat.getClass()); InputSplit[] iss = inputFormat.getSplits(newjob, numSplits / dirs.length); for (InputSplit is : iss) { result.add(new HiveInputSplit(is, inputFormatClass.getName())); } } return result.toArray(new HiveInputSplit[result.size()]); }
public CombineHiveInputSplit(JobConf job, InputSplitShim inputSplitShim) throws IOException { this.inputSplitShim = inputSplitShim; if (job != null) { Map<String, PartitionDesc> pathToPartitionInfo = Utilities .getMapRedWork(job).getPathToPartitionInfo(); // extract all the inputFormatClass names for each chunk in the // CombinedSplit. Path[] ipaths = inputSplitShim.getPaths(); if (ipaths.length > 0) { PartitionDesc part = HiveFileFormatUtils .getPartitionDescFromPathRecursively(pathToPartitionInfo, ipaths[0], IOPrepareCache.get().getPartitionDescMap()); inputFormatClassName = part.getInputFileFormatClass().getName(); } } }
/**
 * Wraps a combined file split and, when a job is available, records the input format class
 * name of the partition that owns the split's first path.
 *
 * @param job job configuration; when {@code null} only the two fields are stored
 * @param inputSplitShim the underlying combined split
 * @param pathToPartitionInfo path-to-partition map; when {@code null} and {@code job} is
 *     non-null, it is read from {@code Utilities.getMapWork(job)}
 * @throws IOException if the first path cannot be resolved to a partition descriptor
 */
public CombineHiveInputSplit(JobConf job, CombineFileSplit inputSplitShim,
    Map<String, PartitionDesc> pathToPartitionInfo) throws IOException {
  this.inputSplitShim = inputSplitShim;
  this.pathToPartitionInfo = pathToPartitionInfo;

  if (job == null) {
    return;
  }
  if (this.pathToPartitionInfo == null) {
    this.pathToPartitionInfo = Utilities.getMapWork(job).getPathToPartitionInfo();
  }

  // Only the first path is consulted to determine the input format class name.
  Path[] splitPaths = inputSplitShim.getPaths();
  if (splitPaths.length == 0) {
    return;
  }
  PartitionDesc firstPartition = HiveFileFormatUtils.getPartitionDescFromPathRecursively(
      this.pathToPartitionInfo, splitPaths[0], IOPrepareCache.get().getPartitionDescMap());
  inputFormatClassName = firstPartition.getInputFileFormatClass().getName();
}
/** * Writable interface. */ public void write(DataOutput out) throws IOException { inputSplitShim.write(out); if (inputFormatClassName == null) { Map<String, PartitionDesc> pathToPartitionInfo = Utilities .getMapRedWork(getJob()).getPathToPartitionInfo(); // extract all the inputFormatClass names for each chunk in the // CombinedSplit. PartitionDesc part = HiveFileFormatUtils.getPartitionDescFromPathRecursively(pathToPartitionInfo, inputSplitShim.getPath(0), IOPrepareCache.get().getPartitionDescMap()); // create a new InputFormat instance if this is the first time to see // this class inputFormatClassName = part.getInputFileFormatClass().getName(); } out.writeUTF(inputFormatClassName); } }
/** * Writable interface. */ @Override public void write(DataOutput out) throws IOException { inputSplitShim.write(out); if (inputFormatClassName == null) { if (pathToPartitionInfo == null) { pathToPartitionInfo = Utilities.getMapWork(getJob()).getPathToPartitionInfo(); } // extract all the inputFormatClass names for each chunk in the // CombinedSplit. PartitionDesc part = HiveFileFormatUtils.getPartitionDescFromPathRecursively(pathToPartitionInfo, inputSplitShim.getPath(0), IOPrepareCache.get().getPartitionDescMap()); // create a new InputFormat instance if this is the first time to see // this class inputFormatClassName = part.getInputFileFormatClass().getName(); } out.writeUTF(inputFormatClassName); } }
.getPartitionDescFromPathRecursively(pathToPartitionInfo, split.getPath(), IOPrepareCache.get().getPartitionDescMap());
Path path = ((FileSplit) s).getPath(); PartitionDesc pd = HiveFileFormatUtils.getPartitionDescFromPathRecursively(work.getPathToPartitionInfo(), path, cache); String currentDeserializerClass = pd.getDeserializerClassName(); HiveFileFormatUtils.getPartitionDescFromPathRecursively(work.getPathToPartitionInfo(), prevPath, cache); previousDeserializerClass = prevPD.getDeserializerClassName();