@SuppressWarnings("unchecked") public RecordWriter<TetherData, NullWritable> getRecordWriter(FileSystem ignore, JobConf job, String name, Progressable prog) throws IOException { Schema schema = AvroJob.getOutputSchema(job); final DataFileWriter writer = new DataFileWriter(new GenericDatumWriter()); if (FileOutputFormat.getCompressOutput(job)) { int level = job.getInt(AvroOutputFormat.DEFLATE_LEVEL_KEY, CodecFactory.DEFAULT_DEFLATE_LEVEL); writer.setCodec(CodecFactory.deflateCodec(level)); } Path path = FileOutputFormat.getTaskOutputPath(job, name+AvroOutputFormat.EXT); writer.create(schema, path.getFileSystem(job).create(path)); return new RecordWriter<TetherData, NullWritable>() { public void write(TetherData datum, NullWritable ignore) throws IOException { writer.appendEncoded(datum.buffer()); } public void close(Reporter reporter) throws IOException { writer.close(); } }; }
// Fragment: the opening of a getRecordWriter(...) implementation; the method
// signature (which declares the job and name parameters used below) is cut
// off in this snippet.
    throws IOException {
  boolean isMapOnly = job.getNumReduceTasks() == 0;
  final Schema schema = isMapOnly
      ? AvroJob.getMapOutputSchema(job)
      : AvroJob.getOutputSchema(job);
  final Path dir = FileOutputFormat.getTaskOutputPath(job, name);
  final FileSystem fs = dir.getFileSystem(job);
  if (!fs.mkdirs(dir))
    throw new IOException("Failed to create directory: " + dir);
  final long blockSize = fs.getDefaultBlockSize();
// Fragment: non-contiguous regions of a store-builder setup method; missing
// closing braces restored.
JobConf conf = prepareJobConf(baseJobConf);
FileSystem fs = outputDir.getFileSystem(conf);
if (fs.exists(outputDir)) {
  info("Deleting previous output in " + outputDir + " for building store "
       + this.storeDef.getName());
  fs.delete(outputDir, true);
}
conf.setInt("io.file.buffer.size", DEFAULT_BUFFER_SIZE);
conf.set("cluster.xml", new ClusterMapper().writeCluster(cluster));
conf.set("stores.xml",
    new StoreDefinitionsMapper().writeStoreList(Collections.singletonList(storeDef)));
conf.setBoolean(VoldemortBuildAndPushJob.SAVE_KEYS, saveKeys);
FileSystem outputFs = outputDir.getFileSystem(conf);
if (outputFs.exists(outputDir)) {
  throw new IOException("Final output directory already exists.");
}
AvroJob.setInputSchema(conf, Schema.parse(baseJobConf.get(AVRO_REC_SCHEMA)));
AvroJob.setOutputSchema(conf,
    Pair.getPairSchema(Schema.create(Schema.Type.BYTES), Schema.create(Schema.Type.BYTES)));
AvroJob.setMapperClass(conf, mapperClass);
conf.setReducerClass(AvroStoreBuilderReducer.class);
/** Returns the specified output serializer. */
public Serializer<AvroWrapper<T>> getSerializer(Class<AvroWrapper<T>> c) {
  // AvroWrapper used for final output, AvroKey or AvroValue for map output
  boolean isFinalOutput = c.equals(AvroWrapper.class);
  Configuration conf = getConf();
  Schema schema = isFinalOutput
      ? AvroJob.getOutputSchema(conf)
      : (AvroKey.class.isAssignableFrom(c)
          ? Pair.getKeySchema(AvroJob.getMapOutputSchema(conf))
          : Pair.getValueSchema(AvroJob.getMapOutputSchema(conf)));
  GenericData dataModel = AvroJob.createDataModel(conf);
  return new AvroWrapperSerializer(dataModel.createDatumWriter(schema));
}
/** Returns the specified map output deserializer.  Defaults to the final
 * output deserializer if no map output schema was specified. */
public Deserializer<AvroWrapper<T>> getDeserializer(Class<AvroWrapper<T>> c) {
  Configuration conf = getConf();
  boolean isKey = AvroKey.class.isAssignableFrom(c);
  Schema schema = isKey
      ? Pair.getKeySchema(AvroJob.getMapOutputSchema(conf))
      : Pair.getValueSchema(AvroJob.getMapOutputSchema(conf));
  GenericData dataModel = AvroJob.createMapOutputDataModel(conf);
  DatumReader<T> datumReader = dataModel.createDatumReader(schema);
  return new AvroWrapperDeserializer(datumReader, isKey);
}
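Hadoop only consults a Serialization like the one above if its class name is listed under io.serializations. A rough sketch of that registration, which AvroJob performs internally when schemas are set (a sketch of the idea, not the library's exact code):

  JobConf conf = new JobConf();
  Collection<String> serializations = conf.getStringCollection("io.serializations");
  if (!serializations.contains(AvroSerialization.class.getName())) {
    serializations.add(AvroSerialization.class.getName());
    conf.setStrings("io.serializations", serializations.toArray(new String[0]));
  }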
@SuppressWarnings({"unchecked", "deprecation"}) public RecordWriter<Object, Object> getRecordWriter(FileSystem fs,JobConf job, String baseFileName, Progressable arg3) throws IOException { String nameOutput = job.get(CONFIG_NAMED_OUTPUT, null); String fileName = getUniqueName(job, baseFileName); Schema schema = null; String schemastr = job.get(MO_PREFIX+nameOutput+".schema",null); if (schemastr!=null) schema = Schema.parse(schemastr); JobConf outputConf = new JobConf(job); outputConf.setOutputFormat(getNamedOutputFormatClass(job, nameOutput)); boolean isMapOnly = job.getNumReduceTasks() == 0; if (schema != null) { if (isMapOnly) AvroJob.setMapOutputSchema(outputConf, schema); else AvroJob.setOutputSchema(outputConf, schema); } OutputFormat outputFormat = outputConf.getOutputFormat(); return outputFormat.getRecordWriter(fs, outputConf, fileName, arg3); } }
@Test
public void testNonAvroMapOnly() throws Exception {
  JobConf job = new JobConf();
  Path outputPath = new Path(OUTPUT_DIR.getRoot().getPath());
  outputPath.getFileSystem(job).delete(outputPath);
  // configure input for non-Avro sequence file
  job.setInputFormat(SequenceFileInputFormat.class);
  FileInputFormat.setInputPaths(job, file().toURI().toString());
  // use a hadoop mapper that emits Avro output
  job.setMapperClass(NonAvroOnlyMapper.class);
  // configure output for avro
  job.setNumReduceTasks(0); // map-only
  FileOutputFormat.setOutputPath(job, outputPath);
  AvroJob.setOutputSchema(job, SCHEMA);
  JobClient.runJob(job);
  checkFile(new DataFileReader<>(
      new File(outputPath.toString() + "/part-00000.avro"),
      new SpecificDatumReader<>()));
}
@Test @SuppressWarnings("deprecation") public void testSort() throws Exception { JobConf job = new JobConf(); String inDir = "../../../share/test/data"; Path input = new Path(inDir+"/weather.avro"); Path output = new Path("target/test/weather-sort"); output.getFileSystem(job).delete(output); job.setJobName("sort weather"); AvroJob.setInputSchema(job, Weather.SCHEMA$); AvroJob.setMapOutputSchema (job, Pair.getPairSchema(Weather.SCHEMA$, Schema.create(Type.NULL))); AvroJob.setOutputSchema(job, Weather.SCHEMA$); AvroJob.setMapperClass(job, SortMapper.class); AvroJob.setReducerClass(job, SortReducer.class); FileOutputFormat.setOutputPath(job, output); FileOutputFormat.setCompressOutput(job, true); AvroJob.setOutputCodec(job, SNAPPY_CODEC);
@SuppressWarnings("deprecation") public void testJob(String pathOut) throws Exception { JobConf job = new JobConf(); String pathIn = INPUT_DIR.getRoot().getPath(); WordCountUtil.writeLinesFile(pathIn + "/lines.avro"); Path outputPath = new Path(pathOut); outputPath.getFileSystem(job).delete(outputPath); job.setJobName("wordcount"); AvroJob.setInputSchema(job, Schema.create(Schema.Type.STRING)); AvroJob.setOutputSchema(job, new Pair<Utf8, Long>(new Utf8(""), 0L).getSchema()); AvroJob.setMapperClass(job, MapImpl.class); AvroJob.setCombinerClass(job, ReduceImpl.class); AvroJob.setReducerClass(job, ReduceImpl.class); FileInputFormat.setInputPaths(job, new Path(pathIn)); FileOutputFormat.setOutputPath(job, new Path(pathOut)); FileOutputFormat.setCompressOutput(job, true); WordCountUtil.setMeta(job); JobClient.runJob(job); WordCountUtil.validateCountsFile(new File(pathOut, "part-00000.avro")); }
@Test
public void testNonAvroReducer() throws Exception {
  JobConf job = new JobConf();
  Path outputPath = new Path(OUTPUT_DIR.getRoot().getPath());
  outputPath.getFileSystem(job).delete(outputPath);
  // configure input for Avro from sequence file
  AvroJob.setInputSequenceFile(job);
  AvroJob.setInputSchema(job, SCHEMA);
  FileInputFormat.setInputPaths(job, file().toURI().toString());
  // mapper is default, identity
  // use a hadoop reducer that consumes Avro input
  AvroJob.setMapOutputSchema(job, SCHEMA);
  job.setReducerClass(NonAvroReducer.class);
  // configure output for non-Avro SequenceFile
  job.setOutputFormat(SequenceFileOutputFormat.class);
  FileOutputFormat.setOutputPath(job, outputPath);
  // output key/value classes are default, LongWritable/Text
  JobClient.runJob(job);
  checkFile(new SequenceFileReader<>(
      new File(outputPath.toString() + "/part-00000")));
}
public void testProjectionNoreducer(String inputDirectory) throws Exception {
  JobConf job = new JobConf();
  long onel = 1;
  Schema readerSchema = Schema.create(Schema.Type.STRING);
  AvroJob.setInputSchema(job, readerSchema);
  Path inputPath = new Path(inputDirectory + "/myavro2-m-00000.avro");
  FileStatus fileStatus = FileSystem.get(job).getFileStatus(inputPath);
  FileSplit fileSplit = new FileSplit(inputPath, 0, fileStatus.getLen(), job);
  AvroRecordReader<Utf8> recordReader = new AvroRecordReader<>(job, fileSplit);
  AvroWrapper<Utf8> inputPair = new AvroWrapper<>(null);
  NullWritable ignore = NullWritable.get();
  while (recordReader.next(inputPair, ignore)) {
    long testl = Long.parseLong(
        inputPair.datum().toString().split(":")[2].replace("}", "").trim());
    Assert.assertEquals(onel, testl);
  }
}
// Fragment: non-contiguous regions of a job-setup method; the guard around
// the throw and part of the loop body are elided in this snippet.
JobConf conf = new JobConf(LabelOccurrenceStep.class);
DumpExtractor.configureJob(conf, args);

// (elided guard)
throw new Exception("Waay to many distinct labels (this must be less than "
    + Integer.MAX_VALUE + ")");

conf.setInt(KEY_TOTAL_LABELS, (int) sensesStep.getTotalLabels());
conf.setJobName("WM: label occurrences");
FileInputFormat.setInputPaths(conf, conf.get(DumpExtractor.KEY_INPUT_FILE));
DistributedCache.addCacheFile(
    new Path(conf.get(DumpExtractor.KEY_SENTENCE_MODEL)).toUri(), conf);
DistributedCache.addCacheFile(
    new Path(conf.get(DumpExtractor.KEY_OUTPUT_DIR) + "/"
        + DumpExtractor.OUTPUT_SITEINFO).toUri(), conf);
DistributedCache.addCacheFile(
    new Path(conf.get(DumpExtractor.KEY_LANG_FILE)).toUri(), conf);
for (FileStatus fs : FileSystem.get(conf).listStatus(sensesStep.getDir())) {
  if (fs.getPath().getName().startsWith("part-")) {
    // (elided)
  }
}
AvroJob.setCombinerClass(conf, Combiner.class);
AvroJob.setReducerClass(conf, Reducer.class);
AvroJob.setOutputSchema(conf,
    Pair.getPairSchema(Schema.create(Type.STRING), LabelOccurrences.getClassSchema()));
FileOutputFormat.setOutputPath(conf, getDir());
@SuppressWarnings("deprecation") public void testProjection(String inputDirectory) throws Exception { JobConf job = new JobConf(); "]}"; Schema readerSchema = Schema.parse(jsonSchema); AvroJob.setInputSchema(job, readerSchema); Path inputPath = new Path(inputDirectory + "/myavro-r-00000.avro"); FileStatus fileStatus = FileSystem.get(job).getFileStatus(inputPath); FileSplit fileSplit = new FileSplit(inputPath, 0, fileStatus.getLen(), job); long numOfCounts = 0; while (recordReader.next(inputPair, ignore)) { Assert.assertEquals(inputPair.datum().get(0), defaultRank); sumOfCounts += (Long) inputPair.datum().get(1); numOfCounts++;
/** Uses default mapper with no reduces for a map-only identity job. */
@Test
@SuppressWarnings("deprecation")
public void testMapOnly() throws Exception {
  JobConf job = new JobConf();
  String inDir = System.getProperty("share.dir", "../../../share") + "/test/data";
  Path input = new Path(inDir + "/weather.avro");
  Path output = new Path("target/test/weather-ident");
  output.getFileSystem(job).delete(output);
  job.setJobName("identity map weather");
  AvroJob.setInputSchema(job, Weather.SCHEMA$);
  AvroJob.setOutputSchema(job, Weather.SCHEMA$);
  FileInputFormat.setInputPaths(job, input);
  FileOutputFormat.setOutputPath(job, output);
  FileOutputFormat.setCompressOutput(job, true);
  job.setNumReduceTasks(0); // map-only
  JobClient.runJob(job);
  // check output is correct
  DatumReader<Weather> reader = new SpecificDatumReader<>();
  DataFileReader<Weather> check = new DataFileReader<>(
      new File(inDir + "/weather.avro"), reader);
  DataFileReader<Weather> sorted = new DataFileReader<>(
      new File(output.toString() + "/part-00000.avro"), reader);
  for (Weather w : sorted)
    assertEquals(check.next(), w);
  check.close();
  sorted.close();
}
@Test
public void testSequenceFileInputFormat() throws Exception {
  JobConf job = new JobConf();
  Path outputPath = new Path(OUTPUT_DIR.getRoot().getPath());
  outputPath.getFileSystem(job).delete(outputPath);
  // configure input for Avro from sequence file
  AvroJob.setInputSequenceFile(job);
  FileInputFormat.setInputPaths(job, file().toURI().toString());
  AvroJob.setInputSchema(job, SCHEMA);
  // mapper is default, identity
  // reducer is default, identity
  // configure output for avro
  AvroJob.setOutputSchema(job, SCHEMA);
  FileOutputFormat.setOutputPath(job, outputPath);
  JobClient.runJob(job);
  checkFile(new DataFileReader<>(
      new File(outputPath.toString() + "/part-00000.avro"),
      new SpecificDatumReader<>()));
}
@Test
public void testJob() throws Exception {
  JobConf job = new JobConf();
  Path outputPath = new Path(DIR.getRoot().getPath() + "/out");
  outputPath.getFileSystem(job).delete(outputPath);
  job.setInputFormat(TextInputFormat.class);
  FileInputFormat.setInputPaths(job, DIR.getRoot().getPath() + "/in");
  job.setMapperClass(AvroTestConverter.class);
  job.setNumReduceTasks(0);
  FileOutputFormat.setOutputPath(job, outputPath);
  System.out.println(createSchema());
  AvroJob.setOutputSchema(job,
      Pair.getPairSchema(Schema.create(Schema.Type.LONG), createSchema()));
  job.setOutputFormat(AvroOutputFormat.class);
  JobClient.runJob(job);
}
@Test
public void testJob() throws Exception {
  JobConf job = new JobConf();
  Path inputPath1 = new Path(INPUT_DIR_1.getRoot().getPath());
  Path inputPath2 = new Path(INPUT_DIR_2.getRoot().getPath());
  Path outputPath = new Path(OUTPUT_DIR.getRoot().getPath());
  outputPath.getFileSystem(job).delete(outputPath);
  writeNamesFiles(new File(inputPath1.toUri().getPath()));
  writeBalancesFiles(new File(inputPath2.toUri().getPath()));
  job.setJobName("multiple-inputs-join");
  AvroMultipleInputs.addInputPath(job, inputPath1, NamesMapImpl.class,
      ReflectData.get().getSchema(NamesRecord.class));
  AvroMultipleInputs.addInputPath(job, inputPath2, BalancesMapImpl.class,
      ReflectData.get().getSchema(BalancesRecord.class));
  Schema keySchema = ReflectData.get().getSchema(KeyRecord.class);
  Schema valueSchema = ReflectData.get().getSchema(JoinableRecord.class);
  AvroJob.setMapOutputSchema(job, Pair.getPairSchema(keySchema, valueSchema));
  AvroJob.setOutputSchema(job, ReflectData.get().getSchema(CompleteRecord.class));
  AvroJob.setReducerClass(job, ReduceImpl.class);
  job.setNumReduceTasks(1);
  FileOutputFormat.setOutputPath(job, outputPath);
  AvroJob.setReflect(job);
  JobClient.runJob(job);
  validateCompleteFile(new File(OUTPUT_DIR.getRoot(), "part-00000.avro"));
}
// Fragment: non-contiguous regions of a tethered word-count test; job
// submission and the declaration of counts are elided in this snippet.
File inputPath = new File(INPUT_DIR.getRoot(), "lines.avro");
JobConf job = new JobConf();
Path outputPath = new Path(outputPathStr);
outputPath.getFileSystem(job).delete(outputPath);
FileOutputFormat.setOutputPath(job, outputPath);
TetherJob.setExecutable(job, exec, execargs, false);
Schema outscheme = new Pair<Utf8, Long>(new Utf8(""), 0L).getSchema();
AvroJob.setInputSchema(job, Schema.create(Schema.Type.STRING));
job.set(AvroJob.OUTPUT_SCHEMA, outscheme.toString());

int numWords = 0;
for (Pair<Utf8, Long> wc : counts) {
  assertEquals(wc.key().toString(),
      WordCountUtil.COUNTS.get(wc.key().toString()), wc.value());
  numWords++;
  // (snippet truncated)
@Test @SuppressWarnings("deprecation") public void testJob() throws Exception { JobConf job = new JobConf(); String dir = "target/testReflectJob"; Path inputPath = new Path(dir + "/in"); Path outputPath = new Path(dir + "/out"); outputPath.getFileSystem(job).delete(outputPath); inputPath.getFileSystem(job).delete(inputPath); writeLinesFile(new File(dir+"/in")); job.setJobName("reflect"); AvroJob.setInputSchema(job, ReflectData.get().getSchema(Text.class)); AvroJob.setMapOutputSchema (job, new Pair(new Text(""), new Count(0L)).getSchema()); AvroJob.setOutputSchema(job, ReflectData.get().getSchema(WordCount.class)); AvroJob.setMapperClass(job, MapImpl.class); //AvroJob.setCombinerClass(job, ReduceImpl.class); AvroJob.setReducerClass(job, ReduceImpl.class); FileInputFormat.setInputPaths(job, inputPath); FileOutputFormat.setOutputPath(job, outputPath); AvroJob.setReflect(job); // use reflection JobClient.runJob(job); validateCountsFile(new File(new File(dir, "out"), "part-00000.avro")); }
public static void runJob(String mysqlJar, String output) throws Exception {
  Configuration conf = new Configuration();
  JobHelper.addJarForJob(conf, mysqlJar);
  DBConfiguration.configureDB(conf, "com.mysql.jdbc.Driver",
      "jdbc:mysql://localhost/sqoop_test"
      + "?user=hip_sqoop_user&password=password");
  JobConf job = new JobConf(conf);
  job.setJarByClass(DBExportMapReduce.class);
  Path outputPath = new Path(output);
  outputPath.getFileSystem(job).delete(outputPath, true);
  job.set(AvroJob.INPUT_SCHEMA, Stock.SCHEMA$.toString());
  // read Avro files as input; the original snippet also called
  // job.setInputFormat(DBInputFormat.class) immediately after this, which
  // would have silently overridden it, so that line is dropped here
  job.setInputFormat(AvroInputFormat.class);
  job.setOutputFormat(DBOutputFormat.class);
  AvroJob.setOutputSchema(job, Stock.SCHEMA$);
  job.setMapperClass(Map.class);
  job.setMapOutputKeyClass(StockRecord.class);
  job.setMapOutputValueClass(NullWritable.class);
  job.setOutputKeyClass(StockRecord.class);
  job.setOutputValueClass(NullWritable.class);
  job.setNumReduceTasks(4);
  DBOutputFormat.setOutput(job, "stocks_export", StockRecord.fields);
  JobClient.runJob(job);
}
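The job references a Map class that bridges Avro input to the JDBC-writable StockRecord. A plausible sketch follows (an assumption, not the book's actual source; the StockRecord copy constructor in particular is hypothetical). AvroInputFormat supplies AvroWrapper<Stock> keys with NullWritable values:

  // imports assumed: org.apache.avro.mapred.AvroWrapper, org.apache.hadoop.io.NullWritable,
  // org.apache.hadoop.mapred.{MapReduceBase, Mapper, OutputCollector, Reporter}, java.io.IOException
  public static class Map extends MapReduceBase
      implements Mapper<AvroWrapper<Stock>, NullWritable, StockRecord, NullWritable> {
    @Override
    public void map(AvroWrapper<Stock> key, NullWritable value,
                    OutputCollector<StockRecord, NullWritable> output,
                    Reporter reporter) throws IOException {
      // hypothetical: copy the Avro record's fields into the DBWritable record
      output.collect(new StockRecord(key.datum()), NullWritable.get());
    }
  }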