/**
 * Removes all temporary files and duplicate (double-committed) files from a given directory.
 *
 * <p>Convenience overload: delegates to the full variant with no dynamic-partition
 * context, file-sink descriptor, or configuration.
 */
public static void removeTempOrDuplicateFiles(FileSystem fs, Path path, boolean isBaseDir)
    throws IOException {
  removeTempOrDuplicateFiles(fs, path, null, null, null, isBaseDir);
}
/**
 * Removes temporary and duplicate (double-committed) files for the given file statuses.
 *
 * <p>Delegates to the overload that also takes a set of files to keep, passing
 * {@code null} so that no additional files are retained.
 */
public static List<Path> removeTempOrDuplicateFiles(FileSystem fs, FileStatus[] fileStats,
    DynamicPartitionCtx dpCtx, FileSinkDesc conf, Configuration hconf, boolean isBaseDir)
    throws IOException {
  return removeTempOrDuplicateFiles(fs, fileStats, dpCtx, conf, hconf, null, isBaseDir);
}
/**
 * Removes all temporary files and duplicate (double-committed) files from a given directory.
 *
 * <p>Convenience overload: delegates to the full variant with no dynamic-partition
 * context, file-sink descriptor, or configuration.
 */
public static void removeTempOrDuplicateFiles(FileSystem fs, Path path) throws IOException {
  removeTempOrDuplicateFiles(fs, path, null, null, null);
}
/**
 * Removes all temporary files and duplicate (double-committed) files under {@code path}.
 *
 * <p>Recurses one directory level when no dynamic-partition context is supplied,
 * otherwise as many levels as there are dynamic-partition columns.
 *
 * @return a list of path names corresponding to should-be-created empty buckets,
 *         or {@code null} when {@code path} is {@code null}
 */
public static List<Path> removeTempOrDuplicateFiles(FileSystem fs, Path path,
    DynamicPartitionCtx dpCtx, FileSinkDesc conf, Configuration hconf, boolean isBaseDir)
    throws IOException {
  if (path == null) {
    return null;
  }
  int recursionDepth = (dpCtx == null) ? 1 : dpCtx.getNumDPCols();
  List<FileStatus> statuses = HiveStatsUtils.getFileStatusRecurse(path, recursionDepth, fs);
  FileStatus[] statusArray = statuses.toArray(new FileStatus[statuses.size()]);
  return removeTempOrDuplicateFiles(fs, statusArray, dpCtx, conf, hconf, isBaseDir);
}
/**
 * Removes all temporary files and duplicate (double-committed) files under {@code path}.
 *
 * <p>Recurses one directory level when no dynamic-partition context is supplied,
 * otherwise as many levels as there are dynamic-partition columns.
 *
 * @return a list of path names corresponding to should-be-created empty buckets,
 *         or {@code null} when {@code path} is {@code null}
 */
public static List<Path> removeTempOrDuplicateFiles(FileSystem fs, Path path,
    DynamicPartitionCtx dpCtx, FileSinkDesc conf, Configuration hconf) throws IOException {
  if (path == null) {
    return null;
  }
  int recursionDepth = (dpCtx == null) ? 1 : dpCtx.getNumDPCols();
  FileStatus[] children = HiveStatsUtils.getFileStatusRecurse(path, recursionDepth, fs);
  return removeTempOrDuplicateFiles(fs, children, dpCtx, conf, hconf);
}
// Promote the task-temp output to the intermediate directory, then strip leftover temp
// files and duplicate (double-committed) outputs; 'false' = not a base directory.
Utilities.rename(fs, tmpPath, intermediatePath); Utilities.removeTempOrDuplicateFiles(fs, intermediatePath, false);
// Promote the task-temp output to the intermediate directory, then strip leftover temp
// files and duplicate (double-committed) outputs before the final move.
Utilities.rename(fs, tmpPath, intermediatePath); Utilities.removeTempOrDuplicateFiles(fs, intermediatePath);
// NOTE(review): this span appears to interleave two variants of the same logic — the
// statements after 'return result;' would be unreachable as written; verify against the
// full enclosing method. The guard checks whether fewer files were produced than the
// table's declared bucket count, skipping the fix-up when the execution engine is Tez.
taskIDToFile = removeTempOrDuplicateFiles(items, fs); return result; taskIDToFile = removeTempOrDuplicateFiles(items, fs); if(taskIDToFile != null && taskIDToFile.size() > 0 && conf != null && conf.getTable() != null && (conf.getTable().getNumBuckets() > taskIDToFile.size()) && !"tez".equalsIgnoreCase(hconf.get(ConfVars.HIVE_EXECUTION_ENGINE.varname))) {
/**
 * Remove all temporary files and duplicate (double-committed) files from a given directory.
 *
 * @return a list of path names corresponding to should-be-created empty buckets.
 */
public static List<Path> removeTempOrDuplicateFiles(FileSystem fs, FileStatus[] fileStats,
    DynamicPartitionCtx dpCtx, FileSinkDesc conf, Configuration hconf, Set<Path> filesKept,
    boolean isBaseDir) throws IOException {
  // Derive the dynamic-partition depth and bucket count up front, then delegate.
  int dpLevels = (dpCtx == null) ? 0 : dpCtx.getNumDPCols();
  int numBuckets = (conf != null && conf.getTable() != null)
      ? conf.getTable().getNumBuckets() : 0;
  return removeTempOrDuplicateFiles(
      fs, fileStats, null, dpLevels, numBuckets, hconf, null, 0, false, filesKept, isBaseDir);
}
// Wrap each MM directory as a path-only status (presumably inside a loop over
// mmDirectories — confirm against the full method), then prune temp/duplicate files
// across the union/DP/bucket layout; the result lists buckets to be created empty.
finalResults[i] = new PathOnlyFileStatus(mmDirectories.get(i)); List<Path> emptyBuckets = Utilities.removeTempOrDuplicateFiles(fs, finalResults, unionSuffix, dpLevels, mbc == null ? 0 : mbc.numBuckets, hconf, writeId, stmtId, isMmTable, null, isInsertOverwrite);
private List<Path> runRemoveTempOrDuplicateFilesTestCase(String executionEngine, boolean dPEnabled) throws Exception { Configuration hconf = new HiveConf(this.getClass()); // do this to verify that Utilities.removeTempOrDuplicateFiles does not revert to default scheme information hconf.set("fs.defaultFS", "hdfs://should-not-be-used/"); hconf.set(HiveConf.ConfVars.HIVE_EXECUTION_ENGINE.varname, executionEngine); FileSystem localFs = FileSystem.getLocal(hconf); DynamicPartitionCtx dpCtx = getDynamicPartitionCtx(dPEnabled); Path tempDirPath = setupTempDirWithSingleOutputFile(hconf); FileSinkDesc conf = getFileSinkDesc(tempDirPath); List<Path> paths = Utilities.removeTempOrDuplicateFiles(localFs, tempDirPath, dpCtx, conf, hconf, false); String expectedScheme = tempDirPath.toUri().getScheme(); String expectedAuthority = tempDirPath.toUri().getAuthority(); assertPathsMatchSchemeAndAuthority(expectedScheme, expectedAuthority, paths); return paths; }
// Time the temp/duplicate-file cleanup with the perf logger; filesKept records paths
// that must survive the pruning, and the result lists buckets to be created empty.
perfLogger.PerfLogBegin("FileSinkOperator", "RemoveTempOrDuplicateFiles"); List<Path> emptyBuckets = Utilities.removeTempOrDuplicateFiles( fs, statuses, dpCtx, conf, hconf, filesKept, false); perfLogger.PerfLogEnd("FileSinkOperator", "RemoveTempOrDuplicateFiles");
// Time the temp/duplicate-file cleanup with the perf logger; the result lists buckets
// that should be created empty afterwards.
perfLogger.PerfLogBegin("FileSinkOperator", "RemoveTempOrDuplicateFiles"); List<Path> emptyBuckets = Utilities.removeTempOrDuplicateFiles(fs, statuses, dpCtx, conf, hconf); perfLogger.PerfLogEnd("FileSinkOperator", "RemoveTempOrDuplicateFiles");
/**
 * Removes all temporary files and duplicate (double-committed) files from a given directory.
 *
 * <p>Convenience overload: delegates to the variant taking a dynamic-partition context,
 * passing {@code null}.
 */
public static void removeTempOrDuplicateFiles(FileSystem fs, Path path) throws IOException {
  removeTempOrDuplicateFiles(fs, path, null);
}
/**
 * Removes all temporary files and duplicate (double-committed) files from a given directory.
 *
 * <p>Convenience overload: delegates to the variant taking a dynamic-partition context,
 * passing {@code null}.
 */
public static void removeTempOrDuplicateFiles(FileSystem fs, Path path) throws IOException {
  removeTempOrDuplicateFiles(fs, path, null);
}
// Promote the task-temp output to the intermediate directory, then strip leftover temp
// files and duplicate (double-committed) outputs before the final move.
Utilities.rename(fs, tmpPath, intermediatePath); Utilities.removeTempOrDuplicateFiles(fs, intermediatePath);
public void mvFileToFinalPath(String specPath, Configuration hconf, boolean success, Log log, DynamicPartitionCtx dpCtx) throws IOException, HiveException { FileSystem fs = (new Path(specPath)).getFileSystem(hconf); Path tmpPath = Utilities.toTempPath(specPath); Path intermediatePath = new Path(tmpPath.getParent(), tmpPath.getName() + ".intermediate"); Path finalPath = new Path(specPath); if (success) { if (fs.exists(tmpPath)) { // Step1: rename tmp output folder to intermediate path. After this // point, updates from speculative tasks still writing to tmpPath // will not appear in finalPath. log.info("Moving tmp dir: " + tmpPath + " to: " + intermediatePath); Utilities.rename(fs, tmpPath, intermediatePath); // Step2: remove any tmp file or double-committed output files ArrayList<String> emptyBuckets = Utilities.removeTempOrDuplicateFiles(fs, intermediatePath, dpCtx); // create empty buckets if necessary if (emptyBuckets.size() > 0) { createEmptyBuckets(hconf, emptyBuckets); } // Step3: move to the file destination log.info("Moving tmp dir: " + intermediatePath + " to: " + finalPath); Utilities.renameOrMoveFiles(fs, intermediatePath, finalPath); } } else { fs.delete(tmpPath, true); } }
// NOTE(review): the cleanup is invoked twice here and the second call's result is
// discarded — this looks like an accidental duplicate (or two interleaved variants);
// verify against the full enclosing method before relying on it.
taskIDToFile = removeTempOrDuplicateFiles(items, fs); removeTempOrDuplicateFiles(items, fs);
// Promote the task-temp output to the intermediate directory, then strip leftover temp
// files and duplicate (double-committed) outputs before the final move.
Utilities.rename(fs, tmpPath, intermediatePath); Utilities.removeTempOrDuplicateFiles(fs, intermediatePath);
public static void mvFileToFinalPath(Path specPath, Configuration hconf, boolean success, Log log, DynamicPartitionCtx dpCtx, FileSinkDesc conf, Reporter reporter) throws IOException, HiveException { FileSystem fs = specPath.getFileSystem(hconf); Path tmpPath = Utilities.toTempPath(specPath); Path taskTmpPath = Utilities.toTaskTempPath(specPath); if (success) { if (fs.exists(tmpPath)) { // remove any tmp file or double-committed output files ArrayList<String> emptyBuckets = Utilities.removeTempOrDuplicateFiles(fs, tmpPath, dpCtx); // create empty buckets if necessary if (emptyBuckets.size() > 0) { createEmptyBuckets(hconf, emptyBuckets, conf, reporter); } // move to the file destination log.info("Moving tmp dir: " + tmpPath + " to: " + specPath); Utilities.renameOrMoveFiles(fs, tmpPath, specPath); } } else { fs.delete(tmpPath, true); } fs.delete(taskTmpPath, true); }