public static Set<Entry<String, DistributedCacheEntry>> readFileInfoFromConfig(Configuration conf) { int num = conf.getInteger(CACHE_FILE_NUM, 0); if (num == 0) { return Collections.emptySet(); } Map<String, DistributedCacheEntry> cacheFiles = new HashMap<String, DistributedCacheEntry>(); for (int i = 1; i <= num; i++) { String name = conf.getString(CACHE_FILE_NAME + i, null); String filePath = conf.getString(CACHE_FILE_PATH + i, null); boolean isExecutable = conf.getBoolean(CACHE_FILE_EXE + i, false); boolean isDirectory = conf.getBoolean(CACHE_FILE_DIR + i, false); byte[] blobKey = conf.getBytes(CACHE_FILE_BLOB_KEY + i, null); cacheFiles.put(name, new DistributedCacheEntry(filePath, isExecutable, blobKey, isDirectory)); } return cacheFiles.entrySet(); }
/** * Registers a file at the distributed cache under the given name. The file will be accessible * from any user-defined function in the (distributed) runtime under a local path. Files * may be local files (which will be distributed via BlobServer), or files in a distributed file system. * The runtime will copy the files temporarily to a local cache, if needed. * * <p>The {@link org.apache.flink.api.common.functions.RuntimeContext} can be obtained inside UDFs via * {@link org.apache.flink.api.common.functions.RichFunction#getRuntimeContext()} and provides access * {@link org.apache.flink.api.common.cache.DistributedCache} via * {@link org.apache.flink.api.common.functions.RuntimeContext#getDistributedCache()}. * * @param filePath The path of the file, as a URI (e.g. "file:///some/path" or "hdfs://host:port/and/path") * @param name The name under which the file is registered. * @param executable flag indicating whether the file should be executable */ public void registerCachedFile(String filePath, String name, boolean executable){ this.cacheFile.add(new Tuple2<>(name, new DistributedCacheEntry(filePath, executable))); }
/** * Registers a file at the distributed cache under the given name. The file will be accessible * from any user-defined function in the (distributed) runtime under a local path. Files * may be local files (which will be distributed via BlobServer), or files in a distributed file system. * The runtime will copy the files temporarily to a local cache, if needed. * * <p>The {@link org.apache.flink.api.common.functions.RuntimeContext} can be obtained inside UDFs via * {@link org.apache.flink.api.common.functions.RichFunction#getRuntimeContext()} and provides access * {@link org.apache.flink.api.common.cache.DistributedCache} via * {@link org.apache.flink.api.common.functions.RuntimeContext#getDistributedCache()}. * * @param filePath The path of the file, as a URI (e.g. "file:///some/path" or "hdfs://host:port/and/path") * @param name The name under which the file is registered. * @param executable flag indicating whether the file should be executable */ public void registerCachedFile(String filePath, String name, boolean executable) { this.cacheFile.add(new Tuple2<>(name, new DistributedCache.DistributedCacheEntry(filePath, executable))); } }
public static void addUserArtifactEntries(Collection<Tuple2<String, DistributedCache.DistributedCacheEntry>> userArtifacts, JobGraph jobGraph) { if (!userArtifacts.isEmpty()) { try { java.nio.file.Path tmpDir = Files.createTempDirectory("flink-distributed-cache-" + jobGraph.getJobID()); for (Tuple2<String, DistributedCache.DistributedCacheEntry> originalEntry : userArtifacts) { Path filePath = new Path(originalEntry.f1.filePath); boolean isLocalDir = false; try { FileSystem sourceFs = filePath.getFileSystem(); isLocalDir = !sourceFs.isDistributedFS() && sourceFs.getFileStatus(filePath).isDir(); } catch (IOException ioe) { LOG.warn("Could not determine whether {} denotes a local path.", filePath, ioe); } // zip local directories because we only support file uploads DistributedCache.DistributedCacheEntry entry; if (isLocalDir) { Path zip = FileUtils.compressDirectory(filePath, new Path(tmpDir.toString(), filePath.getName() + ".zip")); entry = new DistributedCache.DistributedCacheEntry(zip.toString(), originalEntry.f1.isExecutable, true); } else { entry = new DistributedCache.DistributedCacheEntry(filePath.toString(), originalEntry.f1.isExecutable, false); } jobGraph.addUserArtifact(originalEntry.f0, entry); } } catch (IOException ioe) { throw new FlinkRuntimeException("Could not compress distributed-cache artifacts.", ioe); } } }
public static JobGraph getJobGraph(Configuration flinkConfig, FlinkPlan optPlan, List<URL> jarFiles, List<URL> classpaths, List<URI> libjars, List<URI> files, SavepointRestoreSettings savepointSettings) { JobGraph job; if (optPlan instanceof StreamingPlan) { job = ((StreamingPlan) optPlan).getJobGraph(); job.setSavepointRestoreSettings(savepointSettings); } else { JobGraphGenerator gen = new JobGraphGenerator(flinkConfig); job = gen.compileJobGraph((OptimizedPlan) optPlan); } for (URL jar : jarFiles) { try { job.addJar(new Path(jar.toURI())); } catch (URISyntaxException e) { throw new RuntimeException("URL is invalid. This should not happen.", e); } } for (URI libjar : libjars) { job.addJar(new Path(libjar)); } for (URI file : files) { final String fileKey = file.getFragment() != null ? file.getFragment() : new Path(file).getName(); // Remove the part after '#' in file path since this part has been already set to file key. job.addUserArtifact(fileKey, new DistributedCacheEntry( org.apache.commons.lang3.StringUtils.substringBeforeLast(file.toString(), "#"), false, false)); } job.setClasspaths(classpaths); return job; }
/** * register cache files in program level * @param entry contains all relevant information * @param name user defined name of that file * @throws java.io.IOException */ public void registerCachedFile(String name, DistributedCacheEntry entry) throws IOException { if (!this.cacheFile.containsKey(name)) { try { URI u = new URI(entry.filePath); if (!u.getPath().startsWith("/")) { u = new File(entry.filePath).toURI(); } FileSystem fs = FileSystem.get(u); if (fs.exists(new Path(u.getPath()))) { this.cacheFile.put(name, new DistributedCacheEntry(u.toString(), entry.isExecutable)); } else { throw new IOException("File " + u.toString() + " doesn't exist."); } } catch (URISyntaxException ex) { throw new IOException("Invalid path: " + entry.filePath, ex); } } else { throw new IOException("cache file " + name + "already exists!"); } }
/** * Registers a file at the distributed cache under the given name. The file will be accessible * from any user-defined function in the (distributed) runtime under a local path. Files * may be local files (as long as all relevant workers have access to it), or files in a distributed file system. * The runtime will copy the files temporarily to a local cache, if needed. * * <p>The {@link org.apache.flink.api.common.functions.RuntimeContext} can be obtained inside UDFs via * {@link org.apache.flink.api.common.functions.RichFunction#getRuntimeContext()} and provides access * {@link org.apache.flink.api.common.cache.DistributedCache} via * {@link org.apache.flink.api.common.functions.RuntimeContext#getDistributedCache()}. * * @param filePath The path of the file, as a URI (e.g. "file:///some/path" or "hdfs://host:port/and/path") * @param name The name under which the file is registered. * @param executable flag indicating whether the file should be executable */ public void registerCachedFile(String filePath, String name, boolean executable){ this.cacheFile.add(new Tuple2<>(name, new DistributedCacheEntry(filePath, executable))); }
/** * Registers a file at the distributed cache under the given name. The file will be accessible * from any user-defined function in the (distributed) runtime under a local path. Files * may be local files (which will be distributed via BlobServer), or files in a distributed file system. * The runtime will copy the files temporarily to a local cache, if needed. * * <p>The {@link org.apache.flink.api.common.functions.RuntimeContext} can be obtained inside UDFs via * {@link org.apache.flink.api.common.functions.RichFunction#getRuntimeContext()} and provides access * {@link org.apache.flink.api.common.cache.DistributedCache} via * {@link org.apache.flink.api.common.functions.RuntimeContext#getDistributedCache()}. * * @param filePath The path of the file, as a URI (e.g. "file:///some/path" or "hdfs://host:port/and/path") * @param name The name under which the file is registered. * @param executable flag indicating whether the file should be executable */ public void registerCachedFile(String filePath, String name, boolean executable) { this.cacheFile.add(new Tuple2<>(name, new DistributedCache.DistributedCacheEntry(filePath, executable))); } }
/** * Registers a file at the distributed cache under the given name. The file will be accessible * from any user-defined function in the (distributed) runtime under a local path. Files * may be local files (as long as all relevant workers have access to it), or files in a distributed file system. * The runtime will copy the files temporarily to a local cache, if needed. * * <p>The {@link org.apache.flink.api.common.functions.RuntimeContext} can be obtained inside UDFs via * {@link org.apache.flink.api.common.functions.RichFunction#getRuntimeContext()} and provides access * {@link org.apache.flink.api.common.cache.DistributedCache} via * {@link org.apache.flink.api.common.functions.RuntimeContext#getDistributedCache()}. * * @param filePath The path of the file, as a URI (e.g. "file:///some/path" or "hdfs://host:port/and/path") * @param name The name under which the file is registered. * @param executable flag indicating whether the file should be executable */ public void registerCachedFile(String filePath, String name, boolean executable) { this.cacheFile.add(new Tuple2<>(name, new DistributedCache.DistributedCacheEntry(filePath, executable))); } }
public static Set<Entry<String, DistributedCacheEntry>> readFileInfoFromConfig(Configuration conf) { int num = conf.getInteger(CACHE_FILE_NUM, 0); if (num == 0) { return Collections.emptySet(); } Map<String, DistributedCacheEntry> cacheFiles = new HashMap<String, DistributedCacheEntry>(); for (int i = 1; i <= num; i++) { String name = conf.getString(CACHE_FILE_NAME + i, null); String filePath = conf.getString(CACHE_FILE_PATH + i, null); boolean isExecutable = conf.getBoolean(CACHE_FILE_EXE + i, false); boolean isDirectory = conf.getBoolean(CACHE_FILE_DIR + i, false); byte[] blobKey = conf.getBytes(CACHE_FILE_BLOB_KEY + i, null); cacheFiles.put(name, new DistributedCacheEntry(filePath, isExecutable, blobKey, isDirectory)); } return cacheFiles.entrySet(); }
/** * Registers a file at the distributed cache under the given name. The file will be accessible * from any user-defined function in the (distributed) runtime under a local path. Files * may be local files (which will be distributed via BlobServer), or files in a distributed file system. * The runtime will copy the files temporarily to a local cache, if needed. * * <p>The {@link org.apache.flink.api.common.functions.RuntimeContext} can be obtained inside UDFs via * {@link org.apache.flink.api.common.functions.RichFunction#getRuntimeContext()} and provides access * {@link org.apache.flink.api.common.cache.DistributedCache} via * {@link org.apache.flink.api.common.functions.RuntimeContext#getDistributedCache()}. * * @param filePath The path of the file, as a URI (e.g. "file:///some/path" or "hdfs://host:port/and/path") * @param name The name under which the file is registered. * @param executable flag indicating whether the file should be executable */ public void registerCachedFile(String filePath, String name, boolean executable) { this.cacheFile.add(new Tuple2<>(name, new DistributedCache.DistributedCacheEntry(filePath, executable))); } }
/** * Registers a file at the distributed cache under the given name. The file will be accessible * from any user-defined function in the (distributed) runtime under a local path. Files * may be local files (which will be distributed via BlobServer), or files in a distributed file system. * The runtime will copy the files temporarily to a local cache, if needed. * * <p>The {@link org.apache.flink.api.common.functions.RuntimeContext} can be obtained inside UDFs via * {@link org.apache.flink.api.common.functions.RichFunction#getRuntimeContext()} and provides access * {@link org.apache.flink.api.common.cache.DistributedCache} via * {@link org.apache.flink.api.common.functions.RuntimeContext#getDistributedCache()}. * * @param filePath The path of the file, as a URI (e.g. "file:///some/path" or "hdfs://host:port/and/path") * @param name The name under which the file is registered. * @param executable flag indicating whether the file should be executable */ public void registerCachedFile(String filePath, String name, boolean executable){ this.cacheFile.add(new Tuple2<>(name, new DistributedCacheEntry(filePath, executable))); }
public static Set<Entry<String, DistributedCacheEntry>> readFileInfoFromConfig(Configuration conf) { int num = conf.getInteger(CACHE_FILE_NUM, 0); if (num == 0) { return Collections.emptySet(); } Map<String, DistributedCacheEntry> cacheFiles = new HashMap<String, DistributedCacheEntry>(); for (int i = 1; i <= num; i++) { String name = conf.getString(CACHE_FILE_NAME + i, null); String filePath = conf.getString(CACHE_FILE_PATH + i, null); boolean isExecutable = conf.getBoolean(CACHE_FILE_EXE + i, false); boolean isDirectory = conf.getBoolean(CACHE_FILE_DIR + i, false); byte[] blobKey = conf.getBytes(CACHE_FILE_BLOB_KEY + i, null); cacheFiles.put(name, new DistributedCacheEntry(filePath, isExecutable, blobKey, isDirectory)); } return cacheFiles.entrySet(); }
/** * Registers a file at the distributed cache under the given name. The file will be accessible * from any user-defined function in the (distributed) runtime under a local path. Files * may be local files (as long as all relevant workers have access to it), or files in a distributed file system. * The runtime will copy the files temporarily to a local cache, if needed. * * <p>The {@link org.apache.flink.api.common.functions.RuntimeContext} can be obtained inside UDFs via * {@link org.apache.flink.api.common.functions.RichFunction#getRuntimeContext()} and provides access * {@link org.apache.flink.api.common.cache.DistributedCache} via * {@link org.apache.flink.api.common.functions.RuntimeContext#getDistributedCache()}. * * @param filePath The path of the file, as a URI (e.g. "file:///some/path" or "hdfs://host:port/and/path") * @param name The name under which the file is registered. * @param executable flag indicating whether the file should be executable */ public void registerCachedFile(String filePath, String name, boolean executable){ this.cacheFile.add(new Tuple2<>(name, new DistributedCacheEntry(filePath, executable))); }
/** * Registers a file at the distributed cache under the given name. The file will be accessible * from any user-defined function in the (distributed) runtime under a local path. Files * may be local files (as long as all relevant workers have access to it), or files in a distributed file system. * The runtime will copy the files temporarily to a local cache, if needed. * * <p>The {@link org.apache.flink.api.common.functions.RuntimeContext} can be obtained inside UDFs via * {@link org.apache.flink.api.common.functions.RichFunction#getRuntimeContext()} and provides access * {@link org.apache.flink.api.common.cache.DistributedCache} via * {@link org.apache.flink.api.common.functions.RuntimeContext#getDistributedCache()}. * * @param filePath The path of the file, as a URI (e.g. "file:///some/path" or "hdfs://host:port/and/path") * @param name The name under which the file is registered. * @param executable flag indicating whether the file should be executable */ public void registerCachedFile(String filePath, String name, boolean executable) { this.cacheFile.add(new Tuple2<>(name, new DistributedCache.DistributedCacheEntry(filePath, executable))); } }
public void setUserArtifactBlobKey(String entryName, PermanentBlobKey blobKey) throws IOException { byte[] serializedBlobKey; serializedBlobKey = InstantiationUtil.serializeObject(blobKey); userArtifacts.computeIfPresent(entryName, (key, originalEntry) -> new DistributedCache.DistributedCacheEntry( originalEntry.filePath, originalEntry.isExecutable, serializedBlobKey, originalEntry.isZipped )); }
public void setUserArtifactBlobKey(String entryName, PermanentBlobKey blobKey) throws IOException { byte[] serializedBlobKey; serializedBlobKey = InstantiationUtil.serializeObject(blobKey); userArtifacts.computeIfPresent(entryName, (key, originalEntry) -> new DistributedCache.DistributedCacheEntry( originalEntry.filePath, originalEntry.isExecutable, serializedBlobKey, originalEntry.isZipped )); }
public void setUserArtifactBlobKey(String entryName, PermanentBlobKey blobKey) throws IOException { byte[] serializedBlobKey; serializedBlobKey = InstantiationUtil.serializeObject(blobKey); userArtifacts.computeIfPresent(entryName, (key, originalEntry) -> new DistributedCache.DistributedCacheEntry( originalEntry.filePath, originalEntry.isExecutable, serializedBlobKey, originalEntry.isZipped )); }
public static void addUserArtifactEntries(Collection<Tuple2<String, DistributedCache.DistributedCacheEntry>> userArtifacts, JobGraph jobGraph) { if (!userArtifacts.isEmpty()) { try { java.nio.file.Path tmpDir = Files.createTempDirectory("flink-distributed-cache-" + jobGraph.getJobID()); for (Tuple2<String, DistributedCache.DistributedCacheEntry> originalEntry : userArtifacts) { Path filePath = new Path(originalEntry.f1.filePath); boolean isLocalDir = false; try { FileSystem sourceFs = filePath.getFileSystem(); isLocalDir = !sourceFs.isDistributedFS() && sourceFs.getFileStatus(filePath).isDir(); } catch (IOException ioe) { LOG.warn("Could not determine whether {} denotes a local path.", filePath, ioe); } // zip local directories because we only support file uploads DistributedCache.DistributedCacheEntry entry; if (isLocalDir) { Path zip = FileUtils.compressDirectory(filePath, new Path(tmpDir.toString(), filePath.getName() + ".zip")); entry = new DistributedCache.DistributedCacheEntry(zip.toString(), originalEntry.f1.isExecutable, true); } else { entry = new DistributedCache.DistributedCacheEntry(filePath.toString(), originalEntry.f1.isExecutable, false); } jobGraph.addUserArtifact(originalEntry.f0, entry); } } catch (IOException ioe) { throw new FlinkRuntimeException("Could not compress distributed-cache artifacts.", ioe); } } }
@Test public void testArtifactCompression() throws IOException { Path plainFile1 = tmp.newFile("plainFile1").toPath(); Path plainFile2 = tmp.newFile("plainFile2").toPath(); Path directory1 = tmp.newFolder("directory1").toPath(); Files.createDirectory(directory1.resolve("containedFile1")); Path directory2 = tmp.newFolder("directory2").toPath(); Files.createDirectory(directory2.resolve("containedFile2")); JobGraph jb = new JobGraph(); final String executableFileName = "executableFile"; final String nonExecutableFileName = "nonExecutableFile"; final String executableDirName = "executableDir"; final String nonExecutableDirName = "nonExecutableDIr"; Collection<Tuple2<String, DistributedCache.DistributedCacheEntry>> originalArtifacts = Arrays.asList( Tuple2.of(executableFileName, new DistributedCache.DistributedCacheEntry(plainFile1.toString(), true)), Tuple2.of(nonExecutableFileName, new DistributedCache.DistributedCacheEntry(plainFile2.toString(), false)), Tuple2.of(executableDirName, new DistributedCache.DistributedCacheEntry(directory1.toString(), true)), Tuple2.of(nonExecutableDirName, new DistributedCache.DistributedCacheEntry(directory2.toString(), false)) ); JobGraphGenerator.addUserArtifactEntries(originalArtifacts, jb); Map<String, DistributedCache.DistributedCacheEntry> submittedArtifacts = jb.getUserArtifacts(); DistributedCache.DistributedCacheEntry executableFileEntry = submittedArtifacts.get(executableFileName); assertState(executableFileEntry, true, false); DistributedCache.DistributedCacheEntry nonExecutableFileEntry = submittedArtifacts.get(nonExecutableFileName); assertState(nonExecutableFileEntry, false, false); DistributedCache.DistributedCacheEntry executableDirEntry = submittedArtifacts.get(executableDirName); assertState(executableDirEntry, true, true); DistributedCache.DistributedCacheEntry nonExecutableDirEntry = submittedArtifacts.get(nonExecutableDirName); assertState(nonExecutableDirEntry, false, true); }