public int run(String[] args) throws Exception {
  if (args.length != 3)
    Utils.croak("USAGE: GenerateData input-file output-dir value-size");
  JobConf conf = new JobConf(getConf(), GenerateData.class);
  conf.setJobName("generate-data");

  conf.setMapperClass(GenerateDataMapper.class);
  conf.setNumReduceTasks(0); // map-only job, so no reducer is needed

  conf.setInputFormat(TextInputFormat.class);
  conf.setOutputFormat(SequenceFileOutputFormat.class);
  // the job emits raw bytes for both key and value
  conf.setOutputKeyClass(BytesWritable.class);
  conf.setOutputValueClass(BytesWritable.class);

  Path inputPath = new Path(args[0]);
  FileInputFormat.setInputPaths(conf, inputPath);
  Path outputPath = new Path(args[1]);
  // delete output path if it already exists
  FileSystem fs = outputPath.getFileSystem(conf);
  if (fs.exists(outputPath))
    fs.delete(outputPath, true);
  FileOutputFormat.setOutputPath(conf, outputPath);
  conf.setInt("value.size", Integer.parseInt(args[2]));

  JobClient.runJob(conf);
  return 0;
}
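// GenerateDataMapper is referenced above but not shown. A minimal sketch of
// what such a mapper might look like -- an assumption, not the actual
// implementation (usual org.apache.hadoop.mapred imports assumed): it keys
// each record on the input line's bytes and pads the value to "value.size".
public static class GenerateDataMapper extends MapReduceBase
    implements Mapper<LongWritable, Text, BytesWritable, BytesWritable> {
  private int valueSize;

  @Override
  public void configure(JobConf job) {
    // read the value size set by run() above; 1024 is a hypothetical default
    valueSize = job.getInt("value.size", 1024);
  }

  @Override
  public void map(LongWritable key, Text value,
                  OutputCollector<BytesWritable, BytesWritable> output,
                  Reporter reporter) throws IOException {
    // key: the input line's bytes; value: a fixed-size zero-filled buffer
    output.collect(new BytesWritable(value.copyBytes()),
                   new BytesWritable(new byte[valueSize]));
  }
}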
@Test
public void testNonAvroMapper() throws Exception {
  JobConf job = new JobConf();
  Path outputPath = new Path(OUTPUT_DIR.getRoot().getPath());
  outputPath.getFileSystem(job).delete(outputPath);

  // configure input for non-Avro sequence file
  job.setInputFormat(SequenceFileInputFormat.class);
  FileInputFormat.setInputPaths(job, file().toURI().toString());

  // use a hadoop mapper that emits Avro output
  job.setMapperClass(NonAvroMapper.class);

  // reducer is default, identity

  // configure output for avro
  FileOutputFormat.setOutputPath(job, outputPath);
  AvroJob.setOutputSchema(job, SCHEMA);

  JobClient.runJob(job);

  checkFile(new DataFileReader<>(
      new File(outputPath.toString() + "/part-00000.avro"),
      new SpecificDatumReader<>()));
}
/**
 * Run the identity job on a "bytes" Avro file using AvroAsTextInputFormat
 * and AvroTextOutputFormat to produce a sorted "bytes" Avro file.
 */
@Test
public void testSort() throws Exception {
  JobConf job = new JobConf();
  String inputPath = INPUT_DIR.getRoot().getPath();
  Path outputPath = new Path(OUTPUT_DIR.getRoot().getPath());
  outputPath.getFileSystem(job).delete(outputPath);

  WordCountUtil.writeLinesBytesFile(inputPath);

  job.setInputFormat(AvroAsTextInputFormat.class);
  job.setOutputFormat(AvroTextOutputFormat.class);
  job.setOutputKeyClass(Text.class);

  FileInputFormat.setInputPaths(job, new Path(inputPath));
  FileOutputFormat.setOutputPath(job, outputPath);

  JobClient.runJob(job);

  WordCountUtil.validateSortedFile(outputPath.toString() + "/part-00000.avro");
}
void testInputFormat(Class<? extends InputFormat> clazz) throws IOException {
  Configuration conf = UTIL.getConfiguration();
  final JobConf job = new JobConf(conf);
  job.setInputFormat(clazz);
  job.setOutputFormat(NullOutputFormat.class);
  job.setMapperClass(ExampleVerifier.class);
  job.setNumReduceTasks(0);

  LOG.debug("submitting job.");
  final RunningJob run = JobClient.runJob(job);
  assertTrue("job failed!", run.isSuccessful());
  assertEquals("Saw the wrong number of instances of the filtered-for row.", 2,
      run.getCounters()
          .findCounter(TestTableInputFormat.class.getName() + ":row", "aaa").getCounter());
  assertEquals("Saw instances of the filtered-out row.", 0,
      run.getCounters()
          .findCounter(TestTableInputFormat.class.getName() + ":row", "bbb").getCounter());
  assertEquals("Saw the wrong number of instances of columnA.", 1,
      run.getCounters()
          .findCounter(TestTableInputFormat.class.getName() + ":family", "columnA").getCounter());
  assertEquals("Saw the wrong number of instances of columnB.", 1,
      run.getCounters()
          .findCounter(TestTableInputFormat.class.getName() + ":family", "columnB").getCounter());
  assertEquals("Saw the wrong count of values for the filtered-for row.", 2,
      run.getCounters()
          .findCounter(TestTableInputFormat.class.getName() + ":value", "value aaa").getCounter());
  assertEquals("Saw the wrong count of values for the filtered-out row.", 0,
      run.getCounters()
          .findCounter(TestTableInputFormat.class.getName() + ":value", "value bbb").getCounter());
}
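// ExampleVerifier is referenced above but not shown. A plausible sketch,
// assuming the usual org.apache.hadoop.hbase.mapred types: a map-only task
// that tallies one counter per row key, column family, and cell value --
// the same counters the assertions above inspect.
static class ExampleVerifier extends MapReduceBase
    implements Mapper<ImmutableBytesWritable, Result, NullWritable, NullWritable> {
  @Override
  public void map(ImmutableBytesWritable key, Result value,
                  OutputCollector<NullWritable, NullWritable> output,
                  Reporter reporter) throws IOException {
    for (Cell cell : value.listCells()) {
      reporter.incrCounter(TestTableInputFormat.class.getName() + ":row",
          Bytes.toString(CellUtil.cloneRow(cell)), 1);
      reporter.incrCounter(TestTableInputFormat.class.getName() + ":family",
          Bytes.toString(CellUtil.cloneFamily(cell)), 1);
      reporter.incrCounter(TestTableInputFormat.class.getName() + ":value",
          Bytes.toString(CellUtil.cloneValue(cell)), 1);
    }
  }
}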
private static void runIOTest(Class<? extends Mapper> mapperClass, Path outputDir)
    throws IOException {
  JobConf job = new JobConf(fsConfig, TestDFSIO.class);

  FileInputFormat.setInputPaths(job, CONTROL_DIR);
  job.setInputFormat(SequenceFileInputFormat.class);

  job.setMapperClass(mapperClass);
  job.setReducerClass(AccumulatingReducer.class);

  FileOutputFormat.setOutputPath(job, outputDir);
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(Text.class);
  job.setNumReduceTasks(1);
  JobClient.runJob(job);
}
@Test
public void testNonAvroMapOnly() throws Exception {
  JobConf job = new JobConf();
  Path outputPath = new Path(OUTPUT_DIR.getRoot().getPath());
  outputPath.getFileSystem(job).delete(outputPath);

  // configure input for non-Avro sequence file
  job.setInputFormat(SequenceFileInputFormat.class);
  FileInputFormat.setInputPaths(job, file().toURI().toString());

  // use a hadoop mapper that emits Avro output
  job.setMapperClass(NonAvroOnlyMapper.class);

  // configure output for avro
  job.setNumReduceTasks(0); // map-only
  FileOutputFormat.setOutputPath(job, outputPath);
  AvroJob.setOutputSchema(job, SCHEMA);

  JobClient.runJob(job);

  checkFile(new DataFileReader<>(
      new File(outputPath.toString() + "/part-00000.avro"),
      new SpecificDatumReader<>()));
}
/** Runs the demo. */
public static void main(String[] args) throws IOException {
  JobConf conf = new JobConf(DemoMapredNullInput.class);
  conf.setJobName("DemoMapredNullInput");
  conf.setNumMapTasks(10);
  conf.setNumReduceTasks(0);
  conf.setInputFormat(NullInputFormat.class);
  conf.setOutputFormat(NullOutputFormat.class);
  conf.setMapperClass(MyMapper.class);
  JobClient.runJob(conf);
}
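// MyMapper is referenced above but not shown. A minimal sketch, assuming
// NullInputFormat hands each of the 10 map tasks a single NullWritable
// record, so all real work happens inside map() with no input data:
static class MyMapper extends MapReduceBase
    implements Mapper<NullWritable, NullWritable, NullWritable, NullWritable> {
  @Override
  public void map(NullWritable key, NullWritable value,
                  OutputCollector<NullWritable, NullWritable> output,
                  Reporter reporter) throws IOException {
    // do side-effecting work here; nothing is read from or written to HDFS
    reporter.setStatus("running with no input");
  }
}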
protected static void runIOTest(
    @SuppressWarnings("rawtypes") Class<? extends Mapper> mapperClass,
    Path outputDir, JobConf job) throws IOException {
  FileInputFormat.setInputPaths(job, DfsioeConfig.getInstance().getControlDir(fsConfig));
  job.setInputFormat(SequenceFileInputFormat.class);

  job.setMapperClass(mapperClass);
  job.setReducerClass(AccumulatingReducer.class);

  FileOutputFormat.setOutputPath(job, outputDir);
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(Text.class);
  job.setNumReduceTasks(1);
  JobClient.runJob(job);
}
@Test
public void testJob() throws Exception {
  JobConf job = new JobConf();
  Path outputPath = new Path(DIR.getRoot().getPath() + "/out");
  outputPath.getFileSystem(job).delete(outputPath);

  job.setInputFormat(TextInputFormat.class);
  FileInputFormat.setInputPaths(job, DIR.getRoot().getPath() + "/in");

  job.setMapperClass(AvroTestConverter.class);
  job.setNumReduceTasks(0);

  FileOutputFormat.setOutputPath(job, outputPath);
  System.out.println(createSchema());
  AvroJob.setOutputSchema(job,
      Pair.getPairSchema(Schema.create(Schema.Type.LONG), createSchema()));
  job.setOutputFormat(AvroOutputFormat.class);

  JobClient.runJob(job);
}
public long produceSamples(Path samplePath, boolean textOutput) throws Exception {
  Path input = new Path(samplePath.toString() + "-seeds");
  this.numSamples = writeSeeds(input);
  LOG.info("Generating " + this.numSamples + " samples");

  JobConf jobConf = getJobConf();
  jobConf.set("genkmeansdataset.dimensions", Integer.toString(dimension));

  FileInputFormat.setInputPaths(jobConf, input);
  FileOutputFormat.setOutputPath(jobConf, samplePath);
  jobConf.setMapperClass(MapClass.class);

  // input format and key/value classes are identical in both modes;
  // only the output format differs between text and sequence-file output
  jobConf.setInputFormat(SequenceFileInputFormat.class);
  if (textOutput) {
    jobConf.setOutputFormat(TextOutputFormat.class);
  } else {
    jobConf.setOutputFormat(SequenceFileOutputFormat.class);
  }
  jobConf.setOutputKeyClass(LongWritable.class);
  jobConf.setOutputValueClass(VectorWritable.class);

  jobConf.setNumReduceTasks(0);
  JobClient.runJob(jobConf);
  return this.numSamples;
}
@Override
protected void runJob(String jobName, Configuration c, List<Scan> scans)
    throws IOException, InterruptedException, ClassNotFoundException {
  JobConf job = new JobConf(TEST_UTIL.getConfiguration());

  job.setJobName(jobName);
  job.setMapperClass(Mapper.class);
  job.setReducerClass(Reducer.class);

  TableMapReduceUtil.initMultiTableSnapshotMapperJob(getSnapshotScanMapping(scans),
      Mapper.class, ImmutableBytesWritable.class, ImmutableBytesWritable.class, job, true,
      restoreDir);
  TableMapReduceUtil.addDependencyJars(job);

  job.setNumReduceTasks(1); // one to get final "first" and "last" key

  FileOutputFormat.setOutputPath(job, new Path(job.getJobName()));
  LOG.info("Started " + job.getJobName());

  // runJob blocks until the job completes, so no extra waitForCompletion is needed
  RunningJob runningJob = JobClient.runJob(job);
  assertTrue(runningJob.isSuccessful());
  LOG.info("After map/reduce completion - job " + jobName);
}
public static void main(String[] args) throws Exception {
  long startTime = System.currentTimeMillis();
  Configuration conf = new Configuration();
  // parse generic Hadoop options; the remaining arguments carry the paths
  // (otherArgs was referenced below but never declared in the original)
  String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();

  JobConf jobconf = new JobConf(conf, SecondarySortProjectionDriver.class);
  jobconf.setJobName("SecondarySortProjectionDriver");
  jobconf.setMapperClass(SecondarySortProjectionMapper.class);
  jobconf.setReducerClass(SecondarySortProjectionReducer.class);
  jobconf.setOutputKeyClass(Text.class);
  jobconf.setOutputValueClass(Text.class);

  FileInputFormat.setInputPaths(jobconf, new Path(otherArgs[0]));
  FileOutputFormat.setOutputPath(jobconf, new Path(otherArgs[1]));
  jobconf.setInputFormat(TextInputFormat.class);
  jobconf.setOutputFormat(TextOutputFormat.class);
  jobconf.setCompressMapOutput(true);

  // group reducer input by the natural key only, enabling secondary sort
  jobconf.setOutputValueGroupingComparator(NaturalKeyGroupingComparator.class);

  JobClient.runJob(jobconf); // blocks until the job completes
  System.out.println("Elapsed time: " + (System.currentTimeMillis() - startTime) + " ms");
}
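// NaturalKeyGroupingComparator is referenced above but not shown. A sketch of
// the usual secondary-sort grouping comparator, assuming the composite key is
// a Text of the form "naturalKey,sortField" (the delimiter is an assumption):
// only the natural-key prefix decides which reduce group a record joins.
public static class NaturalKeyGroupingComparator extends WritableComparator {
  protected NaturalKeyGroupingComparator() {
    super(Text.class, true); // create underlying Text instances for comparison
  }

  @Override
  public int compare(WritableComparable a, WritableComparable b) {
    // compare only the natural key, ignoring the sort field after the comma
    String naturalA = a.toString().split(",")[0];
    String naturalB = b.toString().split(",")[0];
    return naturalA.compareTo(naturalB);
  }
}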
@Test
public void testNonAvroReducer() throws Exception {
  JobConf job = new JobConf();
  Path outputPath = new Path(OUTPUT_DIR.getRoot().getPath());
  outputPath.getFileSystem(job).delete(outputPath);

  // configure input for Avro from sequence file
  AvroJob.setInputSequenceFile(job);
  AvroJob.setInputSchema(job, SCHEMA);
  FileInputFormat.setInputPaths(job, file().toURI().toString());

  // mapper is default, identity

  // use a hadoop reducer that consumes Avro input
  AvroJob.setMapOutputSchema(job, SCHEMA);
  job.setReducerClass(NonAvroReducer.class);

  // configure output for non-Avro SequenceFile
  job.setOutputFormat(SequenceFileOutputFormat.class);
  FileOutputFormat.setOutputPath(job, outputPath);

  // output key/value classes are default, LongWritable/Text

  JobClient.runJob(job);

  checkFile(new SequenceFileReader<>(new File(outputPath.toString() + "/part-00000")));
}
JobConf job = new JobConf(PagerankData.class);
String jobname = "Create pagerank links";
job.setJobName(jobname); // jobname was built but never applied in the original
Path fout = new Path(options.getResultPath(), EDGES_DIR_NAME);

job.setOutputKeyClass(LongWritable.class);
job.setOutputValueClass(Text.class);
job.setInputFormat(NLineInputFormat.class);
job.setMapperClass(DummyToPageRankLinksMapper.class);
// note: the NLine input paths are configured outside this excerpt
FileOutputFormat.setOutputPath(job, fout);

JobClient.runJob(job);
log.info("Finished Running Job: " + jobname);
@Test
public void testSequenceFileInputFormat() throws Exception {
  JobConf job = new JobConf();
  Path outputPath = new Path(OUTPUT_DIR.getRoot().getPath());
  outputPath.getFileSystem(job).delete(outputPath);

  // configure input for Avro from sequence file
  AvroJob.setInputSequenceFile(job);
  FileInputFormat.setInputPaths(job, file().toURI().toString());
  AvroJob.setInputSchema(job, SCHEMA);

  // mapper is default, identity
  // reducer is default, identity

  // configure output for avro
  AvroJob.setOutputSchema(job, SCHEMA);
  FileOutputFormat.setOutputPath(job, outputPath);

  JobClient.runJob(job);

  checkFile(new DataFileReader<>(
      new File(outputPath.toString() + "/part-00000.avro"),
      new SpecificDatumReader<>()));
}
JobConf job = new JobConf();
job.setInputFormat(NLineInputFormat.class);
job.setMapperClass(CreateBayesPages.class);
job.setNumReduceTasks(0);

FileOutputFormat.setOutputPath(job, fout);
job.setOutputFormat(SequenceFileOutputFormat.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(Text.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);

JobClient.runJob(job);
log.info("Finished Running Job: " + jobname);
/** Uses default mapper with no reduces for a map-only identity job. */
@Test
@SuppressWarnings("deprecation")
public void testMapOnly() throws Exception {
  JobConf job = new JobConf();
  String inDir = System.getProperty("share.dir", "../../../share") + "/test/data";
  Path input = new Path(inDir + "/weather.avro");
  Path output = new Path("target/test/weather-ident");
  output.getFileSystem(job).delete(output);

  job.setJobName("identity map weather");

  AvroJob.setInputSchema(job, Weather.SCHEMA$);
  AvroJob.setOutputSchema(job, Weather.SCHEMA$);

  FileInputFormat.setInputPaths(job, input);
  FileOutputFormat.setOutputPath(job, output);
  FileOutputFormat.setCompressOutput(job, true);

  job.setNumReduceTasks(0); // map-only

  JobClient.runJob(job);

  // check output is correct
  DatumReader<Weather> reader = new SpecificDatumReader<>();
  DataFileReader<Weather> check =
      new DataFileReader<>(new File(inDir + "/weather.avro"), reader);
  DataFileReader<Weather> sorted =
      new DataFileReader<>(new File(output.toString() + "/part-00000.avro"), reader);
  for (Weather w : sorted)
    assertEquals(check.next(), w);
  check.close();
  sorted.close();
}
public static void main(String[] args) throws Exception {
  Configuration conf = new Configuration();
  // parse generic Hadoop options; the remaining arguments carry the paths
  // (otherArgs was referenced below but never declared in the original)
  String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();

  JobConf jobconf = new JobConf(conf, SortByMRF_MovingAverageDriver.class);
  jobconf.setJobName("SortByMRF_MovingAverageDriver");
  jobconf.setMapperClass(SortByMRF_MovingAverageMapper.class);
  jobconf.setReducerClass(SortByMRF_MovingAverageReducer.class);
  jobconf.setOutputKeyClass(Text.class);
  jobconf.setOutputValueClass(Text.class);

  FileInputFormat.setInputPaths(jobconf, new Path(otherArgs[1]));
  FileOutputFormat.setOutputPath(jobconf, new Path(otherArgs[2]));
  jobconf.setInputFormat(TextInputFormat.class);
  jobconf.setOutputFormat(TextOutputFormat.class);
  jobconf.setCompressMapOutput(true);

  // group reducer input by the natural key only, enabling secondary sort
  jobconf.setOutputValueGroupingComparator(NaturalKeyGroupingComparator.class);

  JobClient.runJob(jobconf);
}
@SuppressWarnings("deprecation") public void testJobNoreducer() throws Exception { JobConf job = new JobConf(); job.setNumReduceTasks(0); Path outputPath = new Path(OUTPUT_DIR.getRoot().getPath()); outputPath.getFileSystem(job).delete(outputPath); WordCountUtil.writeLinesFile(new File(INPUT_DIR.getRoot(),"lines.avro")); job.setJobName("AvroMultipleOutputs_noreducer"); AvroJob.setInputSchema(job, Schema.create(Schema.Type.STRING)); AvroJob.setOutputSchema(job, new Pair<Utf8, Long>(new Utf8(""), 0L).getSchema()); AvroJob.setMapperClass(job, MapImpl.class); FileInputFormat.setInputPaths(job, new Path(INPUT_DIR.getRoot().toString())); FileOutputFormat.setOutputPath(job, outputPath); FileOutputFormat.setCompressOutput(job, false); AvroMultipleOutputs.addNamedOutput(job, "myavro2", AvroOutputFormat.class, Schema.create(Schema.Type.STRING)); JobClient.runJob(job); }
JobConf job = new JobConf(NutchData.class);
Path urls = new Path(options.getWorkPath(), URLS_DIR_NAME);
Utils.checkHdfsPath(urls);

job.setInputFormat(NLineInputFormat.class);
job.setMapperClass(CreateUrlHash.class);
job.setNumReduceTasks(0);

job.setMapOutputValueClass(Text.class);
job.setOutputFormat(MapFileOutputFormat.class);
job.setOutputKeyClass(LongWritable.class);
job.setOutputValueClass(Text.class);
MapFileOutputFormat.setOutputPath(job, urls);

// note: the input path (the pages file logged below) is configured outside
// this excerpt
log.info("Pages file " + dummy.getPath() + " as input");
log.info("Rankings file " + urls + " as output");

JobClient.runJob(job);
log.info("Finished Running Job: " + jobname);