compressionCodecs = new CompressionCodecFactory(job);
codec = compressionCodecs.getCodec(file);
FileSystem fs = file.getFileSystem(job);
FSDataInputStream fileIn = fs.open(fileSplit.getPath());
if (isCompressedInput()) {
    LOG.info(format("Input file is compressed. Using compression codec %s", codec.getClass().getName()));
    in = codec.createInputStream(fileIn);
} else {
    LOG.info("The input file is not compressed");
    in = fileIn;
}
public static CompressionCodec createCompressionCodec(String className) throws Exception {
    Configuration configuration = new Configuration();
    CompressionCodecFactory.setCodecClasses(configuration,
        new LinkedList<Class>(Collections.singletonList(Class.forName(className))));
    CompressionCodecFactory ccf = new CompressionCodecFactory(configuration);
    return ccf.getCodecByClassName(className);
}
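A minimal usage sketch for the helper above, assuming createCompressionCodec is visible (e.g. statically imported). GzipCodec is the stock Hadoop codec; the class name CreateCodecDemo and the payload string are illustrative only.

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.InputStream;
import java.io.OutputStream;
import java.nio.charset.StandardCharsets;

import org.apache.hadoop.io.compress.CompressionCodec;

public class CreateCodecDemo {
    public static void main(String[] args) throws Exception {
        // Build a gzip codec by class name via the helper above (GzipCodec ships with Hadoop).
        CompressionCodec codec = createCompressionCodec("org.apache.hadoop.io.compress.GzipCodec");

        // Compress a small payload into memory ...
        ByteArrayOutputStream compressed = new ByteArrayOutputStream();
        try (OutputStream out = codec.createOutputStream(compressed)) {
            out.write("hello, codec".getBytes(StandardCharsets.UTF_8));
        }

        // ... and read it back through the codec's decompressing stream.
        try (InputStream in = codec.createInputStream(new ByteArrayInputStream(compressed.toByteArray()))) {
            System.out.println(new String(in.readAllBytes(), StandardCharsets.UTF_8)); // prints "hello, codec"
        }
    }
}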
@Override
Path createOutputFile() throws IOException {
    Path p = new Path(this.fsUrl + this.fileNameFormat.getPath(),
        this.fileNameFormat.getName(this.rotation, System.currentTimeMillis()));
    this.writer = SequenceFile.createWriter(
        this.hdfsConfig,
        SequenceFile.Writer.file(p),
        SequenceFile.Writer.keyClass(this.format.keyClass()),
        SequenceFile.Writer.valueClass(this.format.valueClass()),
        SequenceFile.Writer.compression(this.compressionType,
            this.codecFactory.getCodecByName(this.compressionCodec))
    );
    return p;
}
private CompressionCodec getCompressionCodec(Configuration conf, String compressionCodecName) {
    checkNotNull(conf);
    checkNotNull(compressionCodecName);
    if (compressionCodecName.equals(NO_COMPRESSION)) {
        return null;
    }
    CompressionCodecFactory codecFactory = new CompressionCodecFactory(conf);
    CompressionCodec codec = codecFactory.getCodecByName(compressionCodecName);
    if (codec == null) {
        throw new RuntimeException("Codec " + compressionCodecName + " not found.");
    }
    return codec;
}
/**
 * Find the codecs specified in the config value io.compression.codecs
 * and register them. Defaults to gzip and deflate.
 */
public CompressionCodecFactory(Configuration conf) {
    codecs = new TreeMap<String, CompressionCodec>();
    codecsByClassName = new HashMap<String, CompressionCodec>();
    codecsByName = new HashMap<String, CompressionCodec>();
    List<Class<? extends CompressionCodec>> codecClasses = getCodecClasses(conf);
    if (codecClasses == null || codecClasses.isEmpty()) {
        addCodec(new GzipCodec());
        addCodec(new DefaultCodec());
    } else {
        for (Class<? extends CompressionCodec> codecClass : codecClasses) {
            addCodec(ReflectionUtils.newInstance(codecClass, conf));
        }
    }
}
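As the constructor above shows, the factory's codec list is driven by io.compression.codecs. A minimal sketch of steering that list through configuration; the codec classes named here ship with Hadoop, while the class name CodecConfigDemo and the file paths are made up for illustration.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionCodecFactory;

public class CodecConfigDemo {
    public static void main(String[] args) {
        Configuration conf = new Configuration();
        // Register only gzip and bzip2; codecs not listed here will not be matched by extension.
        conf.set("io.compression.codecs",
            "org.apache.hadoop.io.compress.GzipCodec,org.apache.hadoop.io.compress.BZip2Codec");

        CompressionCodecFactory factory = new CompressionCodecFactory(conf);
        CompressionCodec bzip2 = factory.getCodec(new Path("/tmp/events.bz2"));    // resolves to BZip2Codec
        CompressionCodec none = factory.getCodec(new Path("/tmp/events.snappy"));  // null: SnappyCodec not registered
        System.out.println(bzip2.getClass().getSimpleName() + ", " + none);
    }
}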
codec = getCompressionCodec(context, getConfiguration());
final CompressionCodecFactory compressionCodecFactory = new CompressionCodecFactory(conf);
for (final Path file : files) {
    try {
        if (!getUserGroupInformation().doAs((PrivilegedExceptionAction<Boolean>) () -> hdfs.exists(file))) {
            continue; // if file is no longer there then move on
        }
        final String originalFilename = file.getName();
        final String relativePath = getPathDifference(rootDir, file);
        stream = getUserGroupInformation().doAs((PrivilegedExceptionAction<FSDataInputStream>) () -> hdfs.open(file, bufferSize));
        codec = compressionCodecFactory.getCodec(file);
        // ... (decompression, flow file import, and attribute handling elided in this excerpt)
        flowFile = session.putAttribute(flowFile, CoreAttributes.FILENAME.key(), outputFilename);
        if (!keepSourceFiles && !getUserGroupInformation().doAs((PrivilegedExceptionAction<Boolean>) () -> hdfs.delete(file, false))) {
            getLogger().warn("Could not remove {} from HDFS. Not ingesting this file ...", new Object[]{file});
            // ... (clean-up for the failed delete elided in this excerpt)
        }
        session.getProvenanceReporter().receive(flowFile, file.toString());
        session.transfer(flowFile, REL_SUCCESS);
        getLogger().info("retrieved {} from HDFS {} in {} milliseconds at a rate of {}",
            // ... (log arguments and the remainder of the loop elided in this excerpt)
public static void decompressFile(final FileSystem fs, final String inFile, final String outFile,
        boolean deletePrevious) throws IOException {
    final Path inPath = new Path(inFile);
    final Path outPath = new Path(outFile);
    final CompressionCodecFactory factory = new CompressionCodecFactory(new Configuration());
    // getCodec() matches the codec by file extension; an unrecognized extension would return null here.
    final CompressionCodec codec = factory.getCodec(inPath);
    final OutputStream out = fs.create(outPath);
    final InputStream in = codec.createInputStream(fs.open(inPath));
    IOUtils.copyBytes(in, out, 8192);
    IOUtils.closeStream(in);
    IOUtils.closeStream(out);
    if (deletePrevious) {
        fs.delete(new Path(inFile), true);
    }
}
public static void testFinding() {
    CompressionCodecFactory factory = new CompressionCodecFactory(new Configuration());
    CompressionCodec codec = factory.getCodec(new Path("/tmp/foo.bar"));
    assertEquals("default factory foo codec", null, codec);
    codec = factory.getCodecByClassName(BarCodec.class.getCanonicalName());
    assertEquals("default factory foo codec", null, codec);
    codec = factory.getCodec(new Path("/tmp/foo.gz"));
    checkCodec("default factory for .gz", GzipCodec.class, codec);
    codec = factory.getCodecByClassName(GzipCodec.class.getCanonicalName());
    checkCodec("default factory for gzip codec", GzipCodec.class, codec);
    codec = factory.getCodecByName("gzip");
    checkCodec("default factory for gzip codec", GzipCodec.class, codec);
    codec = factory.getCodecByName("GZIP");
    checkCodec("default factory for gzip codec", GzipCodec.class, codec);
    codec = factory.getCodecByName("GZIPCodec");
    checkCodec("default factory for gzip codec", GzipCodec.class, codec);
    codec = factory.getCodecByName("gzipcodec");
    checkCodec("default factory for gzip codec", GzipCodec.class, codec);
    Class klass = factory.getCodecClassByName("gzipcodec");
    assertEquals(GzipCodec.class, klass);
    codec = factory.getCodec(new Path("/tmp/foo.bz2"));
    checkCodec("default factory for .bz2", BZip2Codec.class, codec);
    codec = factory.getCodecByClassName(BZip2Codec.class.getCanonicalName());
    checkCodec("default factory for bzip2 codec", BZip2Codec.class, codec);
    codec = factory.getCodecByName("bzip2");
    checkCodec("default factory for bzip2 codec", BZip2Codec.class, codec);
    codec = factory.getCodecByName("bzip2codec");
    checkCodec("default factory for bzip2 codec", BZip2Codec.class, codec);
}
Configuration conf = new Configuration();
CompressionCodecFactory factory = new CompressionCodecFactory(conf);
boolean encode = false;
for (int i = 0; i < args.length; ++i) {
    if ("-in".equals(args[i])) {
        encode = true;
    } else if ("-out".equals(args[i])) {
        encode = false;
    } else {
        CompressionCodec codec = factory.getCodec(new Path(args[i]));
        if (codec == null) {
            System.out.println("Codec for " + args[i] + " not found.");
        } else if (encode) {
            CompressionOutputStream out = null;
            java.io.InputStream in = null;
            try {
                out = codec.createOutputStream(new java.io.FileOutputStream(args[i]));
                byte[] buffer = new byte[100];
                String inFilename = removeSuffix(args[i], codec.getDefaultExtension());
                in = new java.io.FileInputStream(inFilename);
                int len = in.read(buffer);
                // ... (copy loop and stream clean-up elided in this excerpt)
            } finally {
                // ...
            }
        } else {
            CompressionInputStream in = null;
            try {
                in = codec.createInputStream(new java.io.FileInputStream(args[i]));
                byte[] buffer = new byte[100];
                // ... (copy loop and stream clean-up elided in this excerpt)
            } finally {
                // ...
            }
        }
    }
}
private static List<String> readLines(Path location, Configuration conf) throws Exception {
    FileSystem fileSystem = FileSystem.get(location.toUri(), conf);
    CompressionCodecFactory factory = new CompressionCodecFactory(conf);
    FileStatus[] items = fileSystem.listStatus(location);
    if (items == null) {
        return new ArrayList<String>();
    }
    List<String> results = new ArrayList<String>();
    for (FileStatus item : items) {
        // ignoring files like _SUCCESS
        if (item.getPath().getName().startsWith("_")) {
            continue;
        }
        CompressionCodec codec = factory.getCodec(item.getPath());
        InputStream stream = null;
        // check if we have a compression codec we need to use
        if (codec != null) {
            stream = codec.createInputStream(fileSystem.open(item.getPath()));
        } else {
            stream = fileSystem.open(item.getPath());
        }
        StringWriter writer = new StringWriter();
        IOUtils.copy(stream, writer, "UTF-8");
        String raw = writer.toString();
        for (String str : raw.split("\n")) {
            results.add(str);
        }
    }
    return results;
}
@Override
public void initialize(final InputSplit genericSplit, final TaskAttemptContext context) throws IOException {
    final FileSplit split = (FileSplit) genericSplit;
    final Configuration configuration = context.getConfiguration();
    if (configuration.get(Constants.GREMLIN_HADOOP_GRAPH_FILTER, null) != null)
        this.graphFilter = VertexProgramHelper.deserialize(
            ConfUtil.makeApacheConfiguration(configuration), Constants.GREMLIN_HADOOP_GRAPH_FILTER);
    this.gryoReader = GryoReader.build().mapper(
        GryoMapper.build().addRegistries(
            IoRegistryHelper.createRegistries(ConfUtil.makeApacheConfiguration(configuration))).create()).create();
    long start = split.getStart();
    final Path file = split.getPath();
    if (null != new CompressionCodecFactory(configuration).getCodec(file)) {
        throw new IllegalStateException("Compression is not supported for the (binary) Gryo format");
    }
    // open the file and seek to the start of the split
    this.inputStream = file.getFileSystem(configuration).open(split.getPath());
    this.splitLength = split.getLength();
    if (this.splitLength > 0)
        this.splitLength -= (seekToHeader(this.inputStream, start) - start);
}
CompressionCodec codec = null;
Configuration conf = getConfiguration();
final CompressionCodecFactory compressionCodecFactory = new CompressionCodecFactory(conf);
final CompressionType compressionType = CompressionType.valueOf(context.getProperty(COMPRESSION_CODEC).toString());
final boolean inferCompressionCodec = compressionType == CompressionType.AUTOMATIC;
if (inferCompressionCodec) {
    codec = compressionCodecFactory.getCodec(path);
} else if (compressionType != CompressionType.NONE) {
    codec = getCompressionCodec(context, getConfiguration());
}
final Path qualifiedPath = path.makeQualified(hdfs.getUri(), hdfs.getWorkingDirectory());
try {
    final String outputFilename;
    final String originalFilename = path.getName();
    stream = hdfs.open(path, 16384);
    if (codec != null) {
        stream = codec.createInputStream(stream);
        outputFilename = StringUtils.removeEnd(originalFilename, codec.getDefaultExtension());
    } else {
        outputFilename = originalFilename;
    }
    // ... (flow file import and attribute handling elided in this excerpt)
    session.getProvenanceReporter().fetch(flowFile, qualifiedPath.toString(), stopWatch.getDuration(TimeUnit.MILLISECONDS));
    session.transfer(flowFile, REL_SUCCESS);
} catch (final FileNotFoundException | AccessControlException e) {
    // ... (error handling elided in this excerpt)
}
private InputStream openPartFileAsStream(FileStatus fileStatus) throws IOException {
    CompressionCodecFactory compressionFactory = new CompressionCodecFactory(new Configuration());
    InputStream is = null;
    FileSystem fs = ShifuFileUtils.getFileSystemBySourceType(sourceType);
    CompressionCodec codec = compressionFactory.getCodec(fileStatus.getPath());
    if (codec != null) {
        is = codec.createInputStream(fs.open(fileStatus.getPath()));
    } else {
        is = fs.open(fileStatus.getPath());
    }
    return is;
}
/**
 * Returns an {@link InputStream} to the specified file.
 * <p>
 * Note: It is the caller's responsibility to close the returned {@link InputStream}.
 * </p>
 *
 * @param path The path to the file to open.
 * @return An {@link InputStream} for the specified file.
 * @throws FileBasedHelperException if there is a problem opening the {@link InputStream} for the specified file.
 */
@Override
public InputStream getFileStream(String path) throws FileBasedHelperException {
    try {
        Path p = new Path(path);
        InputStream in = this.getFileSystem().open(p);
        // Account for compressed files (e.g. gzip).
        // https://github.com/apache/spark/blob/master/core/src/main/scala/org/apache/spark/input/WholeTextFileRecordReader.scala
        CompressionCodecFactory factory = new CompressionCodecFactory(this.getFileSystem().getConf());
        CompressionCodec codec = factory.getCodec(p);
        return (codec == null) ? in : codec.createInputStream(in);
    } catch (IOException e) {
        throw new FileBasedHelperException("Cannot open file " + path + " due to " + e.getMessage(), e);
    }
}
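The same extension-driven lookup works on the write path. A minimal sketch of the mirror-image helper, assuming a FileSystem handle and a Configuration are already available; the class and method names are hypothetical, not part of any of the snippets above.

import java.io.IOException;
import java.io.OutputStream;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionCodecFactory;

public final class CompressedOutputHelper {
    // Hypothetical helper: wraps FileSystem.create() in a compressing stream when the
    // target file name carries a known codec extension (e.g. ".gz", ".bz2").
    public static OutputStream openFileOutputStream(FileSystem fs, Configuration conf, String path)
            throws IOException {
        Path p = new Path(path);
        OutputStream out = fs.create(p);
        CompressionCodec codec = new CompressionCodecFactory(conf).getCodec(p);
        // getCodec() returns null for an unrecognized extension; fall back to the raw stream.
        return (codec == null) ? out : codec.createOutputStream(out);
    }
}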
@Override
public Writer<FileRegion> getWriter(Writer.Options opts, String blockPoolID) throws IOException {
    if (null == opts) {
        opts = writerOpts;
    }
    if (!(opts instanceof WriterOptions)) {
        throw new IllegalArgumentException("Invalid options " + opts.getClass());
    }
    WriterOptions o = (WriterOptions) opts;
    Configuration cfg = (null == o.getConf()) ? new Configuration() : o.getConf();
    String baseName = fileNameFromBlockPoolID(blockPoolID);
    Path blocksFile = new Path(o.dir, baseName);
    if (o.codec != null) {
        CompressionCodecFactory factory = new CompressionCodecFactory(cfg);
        CompressionCodec codec = factory.getCodecByName(o.codec);
        blocksFile = new Path(o.dir, baseName + codec.getDefaultExtension());
        return createWriter(blocksFile, codec, o.delim, cfg);
    }
    return createWriter(blocksFile, null, o.delim, conf);
}
private DataInputStream getDataInputStream(String path, FileFactory.FileType fileType, int bufferSize,
        String compressor, Configuration configuration) throws IOException {
    path = path.replace("\\", "/");
    Path pt = new Path(path);
    InputStream inputStream;
    FileSystem fs = pt.getFileSystem(configuration);
    if (bufferSize <= 0) {
        inputStream = fs.open(pt);
    } else {
        inputStream = fs.open(pt, bufferSize);
    }
    String codecName = getCodecNameFromCompressor(compressor);
    if (!codecName.isEmpty()) {
        CompressionCodec codec = new CompressionCodecFactory(hadoopConf).getCodecByName(codecName);
        inputStream = codec.createInputStream(inputStream);
    }
    if (bufferSize <= 0 && inputStream instanceof FSDataInputStream) {
        return (DataInputStream) inputStream;
    } else {
        return new DataInputStream(new BufferedInputStream(inputStream));
    }
}
@Override
public void init(Path path, Configuration conf) throws IOException {
    FileSystem fs = path.getFileSystem(conf);
    CompressionCodec codec = new CompressionCodecFactory(conf).getCodec(path);
    OutputStream output;
    if (codec != null) {
        compressor = CodecPool.getCompressor(codec);
        output = codec.createOutputStream(fs.create(path), compressor);
    } else {
        output = fs.create(path);
    }
    writer = new JsonObjectMapperWriter<T>(output, conf.getBoolean("rumen.output.pretty.print", true));
}
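The init() above borrows a compressor from CodecPool, but the matching release is outside the excerpt. A minimal sketch of how a close() method would typically hand it back; the writer and compressor fields follow the snippet above, everything else is an assumption rather than the original code.

@Override
public void close() throws IOException {
    try {
        // Flushes and closes the underlying (possibly compressed) output stream.
        writer.close();
    } finally {
        if (compressor != null) {
            // Return the borrowed compressor so CodecPool can reuse it for other writers.
            CodecPool.returnCompressor(compressor);
            compressor = null;
        }
    }
}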
@VisibleForTesting
TextReader createReader(Path file, String delim, Configuration cfg, String blockPoolID) throws IOException {
    FileSystem fs = file.getFileSystem(cfg);
    if (fs instanceof LocalFileSystem) {
        fs = ((LocalFileSystem) fs).getRaw();
    }
    CompressionCodecFactory factory = new CompressionCodecFactory(cfg);
    CompressionCodec codec = factory.getCodec(file);
    String filename = fileNameFromBlockPoolID(blockPoolID);
    if (codec != null) {
        filename = filename + codec.getDefaultExtension();
    }
    Path bpidFilePath = new Path(file.getParent(), filename);
    return new TextReader(fs, bpidFilePath, codec, delim);
}
public DataStreamRecordReader(InputSplit inputSplit, JobConf conf) throws Exception {
    super();
    this.split = (FileSplit) inputSplit;
    @SuppressWarnings("unchecked")
    Class<? extends Stream> streamClass = (Class<? extends Stream>) Class
        .forName(conf.get("input.stream.class", "stream.io.CsvStream"));
    final Path file = split.getPath();
    final FileSystem fs = file.getFileSystem(conf);
    log.info("Creating new DataStreamRecordReader...");
    log.info("...file = " + file.toString());
    log.info("...streamClass = " + streamClass.getName());
    log.info("...inputSplit = " + inputSplit.toString());
    inputStream = fs.open(file);
    InputStream in = inputStream;
    inputStream.seek(split.getStart());
    CompressionCodecFactory factory = new CompressionCodecFactory(conf);
    CompressionCodec codec = factory.getCodec(file);
    if (codec != null) {
        Decompressor decompressor = CodecPool.getDecompressor(codec);
        in = codec.createInputStream(in, decompressor);
        log.info("The compression codec is " + codec);
    } else {
        log.info("There is no (or no compatible) compression. Installed codecs: "
            + CompressionCodecFactory.getCodecClasses(conf));
    }
    Constructor<? extends Stream> constructor = streamClass.getDeclaredConstructor(InputStream.class);
    stream = constructor.newInstance(in);
    stream.init();
}
private static List<String> readLines(Path location, Configuration conf) throws Exception {
    FileSystem fileSystem = HadoopUtil.getWorkingFileSystem();
    CompressionCodecFactory factory = new CompressionCodecFactory(conf);
    FileStatus[] items = fileSystem.listStatus(location);
    if (items == null) {
        return new ArrayList<String>();
    }
    List<String> results = new ArrayList<String>();
    for (FileStatus item : items) {
        // ignoring files like _SUCCESS
        if (item.getPath().getName().startsWith("_")) {
            continue;
        }
        CompressionCodec codec = factory.getCodec(item.getPath());
        InputStream stream = null;
        // check if we have a compression codec we need to use
        if (codec != null) {
            stream = codec.createInputStream(fileSystem.open(item.getPath()));
        } else {
            stream = fileSystem.open(item.getPath());
        }
        StringWriter writer = new StringWriter();
        IOUtils.copy(stream, writer, "UTF-8");
        String raw = writer.toString();
        for (String str : StringUtil.split(raw, "\n")) {
            results.add(str);
        }
    }
    return results;
}