static <T> void configureDataFileWriter(DataFileWriter<T> writer, JobConf job)
    throws UnsupportedEncodingException {
  CodecFactory factory = getCodecFactory(job);

  if (factory != null) {
    writer.setCodec(factory);
  }

  writer.setSyncInterval(job.getInt(SYNC_INTERVAL_KEY, DEFAULT_SYNC_INTERVAL));

  // copy metadata from job
  for (Map.Entry<String, String> e : job) {
    if (e.getKey().startsWith(AvroJob.TEXT_PREFIX))
      writer.setMeta(e.getKey().substring(AvroJob.TEXT_PREFIX.length()), e.getValue());
    if (e.getKey().startsWith(AvroJob.BINARY_PREFIX))
      writer.setMeta(e.getKey().substring(AvroJob.BINARY_PREFIX.length()),
          URLDecoder.decode(e.getValue(), StandardCharsets.ISO_8859_1.name())
              .getBytes(StandardCharsets.ISO_8859_1));
  }
}
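// Usage sketch for the metadata copy above; the keys here are hypothetical.
// Any job property whose key starts with AvroJob.TEXT_PREFIX or
// AvroJob.BINARY_PREFIX lands in the container file's metadata with the prefix
// stripped; binary values must be URL-encoded ISO-8859-1 so they survive the
// URLDecoder.decode(...) round trip above.
JobConf metaJob = new JobConf();
metaJob.set(AvroJob.TEXT_PREFIX + "user.comment", "nightly build");
metaJob.set(AvroJob.BINARY_PREFIX + "user.digest",
    URLEncoder.encode(new String(new byte[] { 0x01, 0x02 }, StandardCharsets.ISO_8859_1),
        StandardCharsets.ISO_8859_1)); // java.net.URLEncoder, Charset overload (Java 10+)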
@Override
public RecordWriter<K, V> getRecordWriter(FileSystem ignore, JobConf job, String name,
    Progressable prog) throws IOException {
  Schema schema = Schema.create(Schema.Type.BYTES);
  final byte[] keyValueSeparator =
      job.get("mapreduce.output.textoutputformat.separator", "\t").getBytes(UTF8);
  final DataFileWriter<ByteBuffer> writer = new DataFileWriter<>(new ReflectDatumWriter<>());
  AvroOutputFormat.configureDataFileWriter(writer, job);
  Path path = FileOutputFormat.getTaskOutputPath(job, name + EXT);
  writer.create(schema, path.getFileSystem(job).create(path));
  return new AvroTextRecordWriter(writer, keyValueSeparator);
}
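// The key/value separator read above defaults to a tab; a driver can override it
// through the standard Hadoop text-output property (sketch, assumed call site):
job.set("mapreduce.output.textoutputformat.separator", ",");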
@Test
public void testSetSyncInterval() {
  JobConf jobConf = new JobConf();
  int newSyncInterval = 100000;
  AvroOutputFormat.setSyncInterval(jobConf, newSyncInterval);

  assertEquals(newSyncInterval, jobConf.getInt(AvroOutputFormat.SYNC_INTERVAL_KEY, -1));
}
private JobConf createJobConfig() throws IOException {
  Path inputPath = new Path(INPUT_PATH);
  Path outputPath = new Path(OUTPUT_PATH);

  FileSystem.get(new Configuration()).delete(outputPath, true);

  JobConf jobConfig = new JobConf();
  jobConfig.setInputFormat(AvroInputFormat.class);
  jobConfig.setOutputFormat(AvroOutputFormat.class);
  AvroOutputFormat.setOutputPath(jobConfig, outputPath);
  AvroInputFormat.addInputPath(jobConfig, inputPath);
  jobConfig.set(AvroJob.OUTPUT_SCHEMA, User.SCHEMA.toString());
  jobConfig.set(AvroJob.INPUT_SCHEMA, User.SCHEMA.toString());

  return jobConfig;
}
conf.setOutputFormat(AvroOutputFormat.class);
AvroOutputFormat.setDeflateLevel(conf, 9);
AvroOutputFormat.setOutputPath(conf, path);
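// Note (assumption based on the Avro mapred API, worth verifying): setDeflateLevel
// is expected to enable output compression as well as record the level, which is
// why no explicit "mapred.output.compress" toggle appears in this fragment.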
/**
 * Run an Avro Hadoop job with the given job configuration.
 *
 * @param conf the job configuration, with input/output paths and schemas already set
 * @throws Exception if the job fails or is interrupted
 */
public static void runAvroJob(JobConf conf) throws Exception {
  Path[] inputPaths = AvroInputFormat.getInputPaths(conf);
  _log.info("Running hadoop job with input paths:");
  for (Path inputPath : inputPaths) {
    _log.info(inputPath);
  }
  _log.info("Output path=" + AvroOutputFormat.getOutputPath(conf));

  Job job = Job.getInstance(conf);
  job.setJarByClass(AvroUtils.class);
  job.waitForCompletion(true);
}
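// Hypothetical call site: the JobConf is assumed to already carry input paths, an
// output path, and schemas, e.g. as built by createJobConfig() above.
AvroUtils.runAvroJob(createJobConfig());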
String modelPath = props.getString(MODEL_BASE_PATH);
modelPath = modelPath + "/final-model";

props.put(AbstractAvroJob.OUTPUT_PATH, outPath);
conf = createJobConf(AdmmTestMapper.class, AdmmTestReducer.class);
AvroOutputFormat.setOutputPath(conf, new Path(outPath));
AvroUtils.addAvroCacheFiles(conf, new Path(modelPath));
conf.set(MODEL_PATH, modelPath);
@Test
public void testNoCodec() {
  JobConf job = new JobConf();
  assertNull(AvroOutputFormat.getCodecFactory(job));

  job = new JobConf();
  job.set("mapred.output.compress", "false");
  job.set("mapred.output.compression.codec", "org.apache.hadoop.io.compress.BZip2Codec");
  assertNull(AvroOutputFormat.getCodecFactory(job));

  job = new JobConf();
  job.set("mapred.output.compress", "false");
  job.set(AvroJob.OUTPUT_CODEC, "bzip2");
  assertNull(AvroOutputFormat.getCodecFactory(job));
}
    RegressionTestLoglikOutput.SCHEMA$);
_logger.info("Computing loglik...");
AvroOutputFormat.setOutputPath(conf, new Path(outPath));
AvroJob.setInputSchema(conf, AvroUtils.getAvroInputSchema(conf));
AvroUtils.runAvroJob(conf);
@Override
public RecordWriter<AvroWrapper<T>, NullWritable> getRecordWriter(FileSystem ignore,
    JobConf job, String name, Progressable prog) throws IOException {
  boolean isMapOnly = job.getNumReduceTasks() == 0;
  Schema schema = isMapOnly ? AvroJob.getMapOutputSchema(job) : AvroJob.getOutputSchema(job);
  GenericData dataModel = AvroJob.createDataModel(job);

  final DataFileWriter<T> writer = new DataFileWriter<T>(dataModel.createDatumWriter(null));

  configureDataFileWriter(writer, job);

  Path path = FileOutputFormat.getTaskOutputPath(job, name + EXT);
  writer.create(schema, path.getFileSystem(job).create(path));

  return new RecordWriter<AvroWrapper<T>, NullWritable>() {
    public void write(AvroWrapper<T> wrapper, NullWritable ignore) throws IOException {
      writer.append(wrapper.datum());
    }

    public void close(Reporter reporter) throws IOException {
      writer.close();
    }
  };
}
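// Design note as a sketch (assumed driver code, not from the source): because of
// the map-only branch above, a reducer-less job resolves its schema from the map
// output schema, so a driver would configure it along these lines:
JobConf mapOnlyConf = new JobConf();
mapOnlyConf.setNumReduceTasks(0);
AvroJob.setMapOutputSchema(mapOnlyConf, Schema.create(Schema.Type.STRING));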
@Test
public void testBZip2CodecUsingHadoopClass() {
  CodecFactory avroBZip2Codec = CodecFactory.fromString("bzip2");

  JobConf job = new JobConf();
  job.set("mapred.output.compress", "true");
  job.set("mapred.output.compression.codec", "org.apache.hadoop.io.compress.BZip2Codec");
  CodecFactory factory = AvroOutputFormat.getCodecFactory(job);

  assertNotNull(factory);
  assertEquals(factory.getClass(), avroBZip2Codec.getClass());
}
    + String.valueOf(i) + " is: " + String.valueOf(liblinearEpsilon));
_logger.info("aggressiveLiblinearEpsilonDecay=" + aggressiveLiblinearEpsilonDecay);
AvroOutputFormat.setOutputPath(conf, new Path(outpath));
AvroUtils.addAvroCacheFiles(conf, new Path(uPath));
AvroUtils.addAvroCacheFiles(conf, new Path(zPath));
@Test
public void testSnappyCodecUsingHadoopClass() {
  CodecFactory avroSnappyCodec = CodecFactory.fromString("snappy");

  JobConf job = new JobConf();
  job.set("mapred.output.compress", "true");
  job.set("mapred.output.compression.codec", "org.apache.hadoop.io.compress.SnappyCodec");
  CodecFactory factory = AvroOutputFormat.getCodecFactory(job);

  assertNotNull(factory);
  assertEquals(factory.getClass(), avroSnappyCodec.getClass());
}
@Test
public void testSnappyCodecUsingAvroCodec() {
  CodecFactory avroSnappyCodec = CodecFactory.fromString("snappy");

  JobConf job = new JobConf();
  job.set("mapred.output.compress", "true");
  job.set(AvroJob.OUTPUT_CODEC, "snappy");
  CodecFactory factory = AvroOutputFormat.getCodecFactory(job);

  assertNotNull(factory);
  assertEquals(factory.getClass(), avroSnappyCodec.getClass());
}
@Test
public void testBZip2CodecUsingAvroCodec() {
  CodecFactory avroBZip2Codec = CodecFactory.fromString("bzip2");

  JobConf job = new JobConf();
  job.set("mapred.output.compress", "true");
  job.set(AvroJob.OUTPUT_CODEC, "bzip2");
  CodecFactory factory = AvroOutputFormat.getCodecFactory(job);

  assertNotNull(factory);
  assertEquals(factory.getClass(), avroBZip2Codec.getClass());
}
@Test
public void testDeflateCodecUsingAvroCodec() {
  CodecFactory avroDeflateCodec = CodecFactory.fromString("deflate");

  JobConf job = new JobConf();
  job.set("mapred.output.compress", "true");
  job.set(AvroJob.OUTPUT_CODEC, "deflate");
  CodecFactory factory = AvroOutputFormat.getCodecFactory(job);

  assertNotNull(factory);
  assertEquals(factory.getClass(), avroDeflateCodec.getClass());
}
@Test
public void testDeflateCodecUsingHadoopClass() {
  CodecFactory avroDeflateCodec = CodecFactory.fromString("deflate");

  JobConf job = new JobConf();
  job.set("mapred.output.compress", "true");
  job.set("mapred.output.compression.codec", "org.apache.hadoop.io.compress.DeflateCodec");
  CodecFactory factory = AvroOutputFormat.getCodecFactory(job);

  assertNotNull(factory);
  assertEquals(factory.getClass(), avroDeflateCodec.getClass());
}
@Test
public void testGZipCodecUsingHadoopClass() {
  CodecFactory avroDeflateCodec = CodecFactory.fromString("deflate");

  JobConf job = new JobConf();
  job.set("mapred.output.compress", "true");
  job.set("mapred.output.compression.codec", "org.apache.hadoop.io.compress.GZipCodec");
  CodecFactory factory = AvroOutputFormat.getCodecFactory(job);

  assertNotNull(factory);
  assertEquals(factory.getClass(), avroDeflateCodec.getClass());
}
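// Summary sketch of the two configuration styles the tests above exercise
// (property names and values taken from the tests themselves):
JobConf conf = new JobConf();
conf.setBoolean("mapred.output.compress", true);
// Style 1: name a Hadoop compression codec class; getCodecFactory translates it to
// the matching Avro codec (note GZipCodec maps to Avro's deflate, per the last test).
conf.set("mapred.output.compression.codec", "org.apache.hadoop.io.compress.SnappyCodec");
// Style 2: name the Avro codec directly.
conf.set(AvroJob.OUTPUT_CODEC, "snappy");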