@SuppressWarnings("unchecked") public RecordWriter<TetherData, NullWritable> getRecordWriter(FileSystem ignore, JobConf job, String name, Progressable prog) throws IOException { Schema schema = AvroJob.getOutputSchema(job); final DataFileWriter writer = new DataFileWriter(new GenericDatumWriter()); if (FileOutputFormat.getCompressOutput(job)) { int level = job.getInt(AvroOutputFormat.DEFLATE_LEVEL_KEY, CodecFactory.DEFAULT_DEFLATE_LEVEL); writer.setCodec(CodecFactory.deflateCodec(level)); } Path path = FileOutputFormat.getTaskOutputPath(job, name+AvroOutputFormat.EXT); writer.create(schema, path.getFileSystem(job).create(path)); return new RecordWriter<TetherData, NullWritable>() { public void write(TetherData datum, NullWritable ignore) throws IOException { writer.appendEncoded(datum.buffer()); } public void close(Reporter reporter) throws IOException { writer.close(); } }; }
// Fragment: the opening of a getRecordWriter(...) implementation; the method
// signature (which declares the job and name parameters used below) is cut
// off in this snippet.
    throws IOException {
  boolean isMapOnly = job.getNumReduceTasks() == 0;
  final Schema schema = isMapOnly
      ? AvroJob.getMapOutputSchema(job)
      : AvroJob.getOutputSchema(job);
  final Path dir = FileOutputFormat.getTaskOutputPath(job, name);
  final FileSystem fs = dir.getFileSystem(job);
  if (!fs.mkdirs(dir))
    throw new IOException("Failed to create directory: " + dir);
  final long blockSize = fs.getDefaultBlockSize();
// Fragment: non-contiguous regions of a store-builder setup method; missing
// closing braces restored.
JobConf conf = prepareJobConf(baseJobConf);
FileSystem fs = outputDir.getFileSystem(conf);
if (fs.exists(outputDir)) {
  info("Deleting previous output in " + outputDir + " for building store "
       + this.storeDef.getName());
  fs.delete(outputDir, true);
}
conf.setInt("io.file.buffer.size", DEFAULT_BUFFER_SIZE);
conf.set("cluster.xml", new ClusterMapper().writeCluster(cluster));
conf.set("stores.xml",
    new StoreDefinitionsMapper().writeStoreList(Collections.singletonList(storeDef)));
conf.setBoolean(VoldemortBuildAndPushJob.SAVE_KEYS, saveKeys);
FileSystem outputFs = outputDir.getFileSystem(conf);
if (outputFs.exists(outputDir)) {
  throw new IOException("Final output directory already exists.");
}
AvroJob.setInputSchema(conf, Schema.parse(baseJobConf.get(AVRO_REC_SCHEMA)));
AvroJob.setOutputSchema(conf,
    Pair.getPairSchema(Schema.create(Schema.Type.BYTES), Schema.create(Schema.Type.BYTES)));
AvroJob.setMapperClass(conf, mapperClass);
conf.setReducerClass(AvroStoreBuilderReducer.class);
/** Returns the specified output serializer. */
public Serializer<AvroWrapper<T>> getSerializer(Class<AvroWrapper<T>> c) {
  // AvroWrapper used for final output, AvroKey or AvroValue for map output
  boolean isFinalOutput = c.equals(AvroWrapper.class);
  Configuration conf = getConf();
  Schema schema = isFinalOutput
      ? AvroJob.getOutputSchema(conf)
      : (AvroKey.class.isAssignableFrom(c)
          ? Pair.getKeySchema(AvroJob.getMapOutputSchema(conf))
          : Pair.getValueSchema(AvroJob.getMapOutputSchema(conf)));
  GenericData dataModel = AvroJob.createDataModel(conf);
  return new AvroWrapperSerializer(dataModel.createDatumWriter(schema));
}
/** Returns the specified map output deserializer.  Defaults to the final
 * output deserializer if no map output schema was specified. */
public Deserializer<AvroWrapper<T>> getDeserializer(Class<AvroWrapper<T>> c) {
  Configuration conf = getConf();
  boolean isKey = AvroKey.class.isAssignableFrom(c);
  Schema schema = isKey
      ? Pair.getKeySchema(AvroJob.getMapOutputSchema(conf))
      : Pair.getValueSchema(AvroJob.getMapOutputSchema(conf));
  GenericData dataModel = AvroJob.createMapOutputDataModel(conf);
  DatumReader<T> datumReader = dataModel.createDatumReader(schema);
  return new AvroWrapperDeserializer(datumReader, isKey);
}
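Hadoop only consults a Serialization like the one above if its class name is listed under io.serializations. A rough sketch of that registration, which AvroJob performs internally when schemas are set (a sketch of the idea, not the library's exact code):

  JobConf conf = new JobConf();
  Collection<String> serializations = conf.getStringCollection("io.serializations");
  if (!serializations.contains(AvroSerialization.class.getName())) {
    serializations.add(AvroSerialization.class.getName());
    conf.setStrings("io.serializations", serializations.toArray(new String[0]));
  }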
@SuppressWarnings({"unchecked", "deprecation"}) public RecordWriter<Object, Object> getRecordWriter(FileSystem fs,JobConf job, String baseFileName, Progressable arg3) throws IOException { String nameOutput = job.get(CONFIG_NAMED_OUTPUT, null); String fileName = getUniqueName(job, baseFileName); Schema schema = null; String schemastr = job.get(MO_PREFIX+nameOutput+".schema",null); if (schemastr!=null) schema = Schema.parse(schemastr); JobConf outputConf = new JobConf(job); outputConf.setOutputFormat(getNamedOutputFormatClass(job, nameOutput)); boolean isMapOnly = job.getNumReduceTasks() == 0; if (schema != null) { if (isMapOnly) AvroJob.setMapOutputSchema(outputConf, schema); else AvroJob.setOutputSchema(outputConf, schema); } OutputFormat outputFormat = outputConf.getOutputFormat(); return outputFormat.getRecordWriter(fs, outputConf, fileName, arg3); } }
@Test
public void testNonAvroMapOnly() throws Exception {
  JobConf job = new JobConf();
  Path outputPath = new Path(OUTPUT_DIR.getRoot().getPath());
  outputPath.getFileSystem(job).delete(outputPath);
  // configure input for non-Avro sequence file
  job.setInputFormat(SequenceFileInputFormat.class);
  FileInputFormat.setInputPaths(job, file().toURI().toString());
  // use a hadoop mapper that emits Avro output
  job.setMapperClass(NonAvroOnlyMapper.class);
  // configure output for avro
  job.setNumReduceTasks(0); // map-only
  FileOutputFormat.setOutputPath(job, outputPath);
  AvroJob.setOutputSchema(job, SCHEMA);
  JobClient.runJob(job);
  checkFile(new DataFileReader<>(
      new File(outputPath.toString() + "/part-00000.avro"),
      new SpecificDatumReader<>()));
}
@Test @SuppressWarnings("deprecation") public void testSort() throws Exception { JobConf job = new JobConf(); String inDir = "../../../share/test/data"; Path input = new Path(inDir+"/weather.avro"); Path output = new Path("target/test/weather-sort"); output.getFileSystem(job).delete(output); job.setJobName("sort weather"); AvroJob.setInputSchema(job, Weather.SCHEMA$); AvroJob.setMapOutputSchema (job, Pair.getPairSchema(Weather.SCHEMA$, Schema.create(Type.NULL))); AvroJob.setOutputSchema(job, Weather.SCHEMA$); AvroJob.setMapperClass(job, SortMapper.class); AvroJob.setReducerClass(job, SortReducer.class); FileOutputFormat.setOutputPath(job, output); FileOutputFormat.setCompressOutput(job, true); AvroJob.setOutputCodec(job, SNAPPY_CODEC);
@SuppressWarnings("deprecation") public void testJob(String pathOut) throws Exception { JobConf job = new JobConf(); String pathIn = INPUT_DIR.getRoot().getPath(); WordCountUtil.writeLinesFile(pathIn + "/lines.avro"); Path outputPath = new Path(pathOut); outputPath.getFileSystem(job).delete(outputPath); job.setJobName("wordcount"); AvroJob.setInputSchema(job, Schema.create(Schema.Type.STRING)); AvroJob.setOutputSchema(job, new Pair<Utf8, Long>(new Utf8(""), 0L).getSchema()); AvroJob.setMapperClass(job, MapImpl.class); AvroJob.setCombinerClass(job, ReduceImpl.class); AvroJob.setReducerClass(job, ReduceImpl.class); FileInputFormat.setInputPaths(job, new Path(pathIn)); FileOutputFormat.setOutputPath(job, new Path(pathOut)); FileOutputFormat.setCompressOutput(job, true); WordCountUtil.setMeta(job); JobClient.runJob(job); WordCountUtil.validateCountsFile(new File(pathOut, "part-00000.avro")); }
@Test
public void testNonAvroReducer() throws Exception {
  JobConf job = new JobConf();
  Path outputPath = new Path(OUTPUT_DIR.getRoot().getPath());
  outputPath.getFileSystem(job).delete(outputPath);
  // configure input for Avro from sequence file
  AvroJob.setInputSequenceFile(job);
  AvroJob.setInputSchema(job, SCHEMA);
  FileInputFormat.setInputPaths(job, file().toURI().toString());
  // mapper is default, identity
  // use a hadoop reducer that consumes Avro input
  AvroJob.setMapOutputSchema(job, SCHEMA);
  job.setReducerClass(NonAvroReducer.class);
  // configure output for non-Avro SequenceFile
  job.setOutputFormat(SequenceFileOutputFormat.class);
  FileOutputFormat.setOutputPath(job, outputPath);
  // output key/value classes are default, LongWritable/Text
  JobClient.runJob(job);
  checkFile(new SequenceFileReader<>(
      new File(outputPath.toString() + "/part-00000")));
}
public void testProjectionNoreducer(String inputDirectory) throws Exception {
  JobConf job = new JobConf();
  long onel = 1;
  Schema readerSchema = Schema.create(Schema.Type.STRING);
  AvroJob.setInputSchema(job, readerSchema);
  Path inputPath = new Path(inputDirectory + "/myavro2-m-00000.avro");
  FileStatus fileStatus = FileSystem.get(job).getFileStatus(inputPath);
  FileSplit fileSplit = new FileSplit(inputPath, 0, fileStatus.getLen(), job);
  AvroRecordReader<Utf8> recordReader = new AvroRecordReader<>(job, fileSplit);
  AvroWrapper<Utf8> inputPair = new AvroWrapper<>(null);
  NullWritable ignore = NullWritable.get();
  while (recordReader.next(inputPair, ignore)) {
    long testl = Long.parseLong(
        inputPair.datum().toString().split(":")[2].replace("}", "").trim());
    Assert.assertEquals(onel, testl);
  }
}
// Fragment: non-contiguous regions of a job-setup method; the guard around
// the throw and part of the loop body are elided in this snippet.
JobConf conf = new JobConf(LabelOccurrenceStep.class);
DumpExtractor.configureJob(conf, args);

// (elided guard)
throw new Exception("Waay to many distinct labels (this must be less than "
    + Integer.MAX_VALUE + ")");

conf.setInt(KEY_TOTAL_LABELS, (int) sensesStep.getTotalLabels());
conf.setJobName("WM: label occurrences");
FileInputFormat.setInputPaths(conf, conf.get(DumpExtractor.KEY_INPUT_FILE));
DistributedCache.addCacheFile(
    new Path(conf.get(DumpExtractor.KEY_SENTENCE_MODEL)).toUri(), conf);
DistributedCache.addCacheFile(
    new Path(conf.get(DumpExtractor.KEY_OUTPUT_DIR) + "/"
        + DumpExtractor.OUTPUT_SITEINFO).toUri(), conf);
DistributedCache.addCacheFile(
    new Path(conf.get(DumpExtractor.KEY_LANG_FILE)).toUri(), conf);
for (FileStatus fs : FileSystem.get(conf).listStatus(sensesStep.getDir())) {
  if (fs.getPath().getName().startsWith("part-")) {
    // (elided)
  }
}
AvroJob.setCombinerClass(conf, Combiner.class);
AvroJob.setReducerClass(conf, Reducer.class);
AvroJob.setOutputSchema(conf,
    Pair.getPairSchema(Schema.create(Type.STRING), LabelOccurrences.getClassSchema()));
FileOutputFormat.setOutputPath(conf, getDir());
@SuppressWarnings("deprecation") public void testProjection(String inputDirectory) throws Exception { JobConf job = new JobConf(); "]}"; Schema readerSchema = Schema.parse(jsonSchema); AvroJob.setInputSchema(job, readerSchema); Path inputPath = new Path(inputDirectory + "/myavro-r-00000.avro"); FileStatus fileStatus = FileSystem.get(job).getFileStatus(inputPath); FileSplit fileSplit = new FileSplit(inputPath, 0, fileStatus.getLen(), job); long numOfCounts = 0; while (recordReader.next(inputPair, ignore)) { Assert.assertEquals(inputPair.datum().get(0), defaultRank); sumOfCounts += (Long) inputPair.datum().get(1); numOfCounts++;
/** Uses default mapper with no reduces for a map-only identity job. */
@Test
@SuppressWarnings("deprecation")
public void testMapOnly() throws Exception {
  JobConf job = new JobConf();
  String inDir = System.getProperty("share.dir", "../../../share") + "/test/data";
  Path input = new Path(inDir + "/weather.avro");
  Path output = new Path("target/test/weather-ident");
  output.getFileSystem(job).delete(output);
  job.setJobName("identity map weather");
  AvroJob.setInputSchema(job, Weather.SCHEMA$);
  AvroJob.setOutputSchema(job, Weather.SCHEMA$);
  FileInputFormat.setInputPaths(job, input);
  FileOutputFormat.setOutputPath(job, output);
  FileOutputFormat.setCompressOutput(job, true);
  job.setNumReduceTasks(0); // map-only
  JobClient.runJob(job);
  // check output is correct
  DatumReader<Weather> reader = new SpecificDatumReader<>();
  DataFileReader<Weather> check = new DataFileReader<>(
      new File(inDir + "/weather.avro"), reader);
  DataFileReader<Weather> sorted = new DataFileReader<>(
      new File(output.toString() + "/part-00000.avro"), reader);
  for (Weather w : sorted)
    assertEquals(check.next(), w);
  check.close();
  sorted.close();
}
@Test
public void testSequenceFileInputFormat() throws Exception {
  JobConf job = new JobConf();
  Path outputPath = new Path(OUTPUT_DIR.getRoot().getPath());
  outputPath.getFileSystem(job).delete(outputPath);
  // configure input for Avro from sequence file
  AvroJob.setInputSequenceFile(job);
  FileInputFormat.setInputPaths(job, file().toURI().toString());
  AvroJob.setInputSchema(job, SCHEMA);
  // mapper is default, identity
  // reducer is default, identity
  // configure output for avro
  AvroJob.setOutputSchema(job, SCHEMA);
  FileOutputFormat.setOutputPath(job, outputPath);
  JobClient.runJob(job);
  checkFile(new DataFileReader<>(
      new File(outputPath.toString() + "/part-00000.avro"),
      new SpecificDatumReader<>()));
}
@Test
public void testJob() throws Exception {
  JobConf job = new JobConf();
  Path outputPath = new Path(DIR.getRoot().getPath() + "/out");
  outputPath.getFileSystem(job).delete(outputPath);
  job.setInputFormat(TextInputFormat.class);
  FileInputFormat.setInputPaths(job, DIR.getRoot().getPath() + "/in");
  job.setMapperClass(AvroTestConverter.class);
  job.setNumReduceTasks(0);
  FileOutputFormat.setOutputPath(job, outputPath);
  System.out.println(createSchema());
  AvroJob.setOutputSchema(job,
      Pair.getPairSchema(Schema.create(Schema.Type.LONG), createSchema()));
  job.setOutputFormat(AvroOutputFormat.class);
  JobClient.runJob(job);
}
@Test
public void testJob() throws Exception {
  JobConf job = new JobConf();
  Path inputPath1 = new Path(INPUT_DIR_1.getRoot().getPath());
  Path inputPath2 = new Path(INPUT_DIR_2.getRoot().getPath());
  Path outputPath = new Path(OUTPUT_DIR.getRoot().getPath());
  outputPath.getFileSystem(job).delete(outputPath);
  writeNamesFiles(new File(inputPath1.toUri().getPath()));
  writeBalancesFiles(new File(inputPath2.toUri().getPath()));
  job.setJobName("multiple-inputs-join");
  AvroMultipleInputs.addInputPath(job, inputPath1, NamesMapImpl.class,
      ReflectData.get().getSchema(NamesRecord.class));
  AvroMultipleInputs.addInputPath(job, inputPath2, BalancesMapImpl.class,
      ReflectData.get().getSchema(BalancesRecord.class));
  Schema keySchema = ReflectData.get().getSchema(KeyRecord.class);
  Schema valueSchema = ReflectData.get().getSchema(JoinableRecord.class);
  AvroJob.setMapOutputSchema(job, Pair.getPairSchema(keySchema, valueSchema));
  AvroJob.setOutputSchema(job, ReflectData.get().getSchema(CompleteRecord.class));
  AvroJob.setReducerClass(job, ReduceImpl.class);
  job.setNumReduceTasks(1);
  FileOutputFormat.setOutputPath(job, outputPath);
  AvroJob.setReflect(job);
  JobClient.runJob(job);
  validateCompleteFile(new File(OUTPUT_DIR.getRoot(), "part-00000.avro"));
}
// Fragment: non-contiguous regions of a tethered word-count test; job
// submission and the declaration of counts are elided in this snippet.
File inputPath = new File(INPUT_DIR.getRoot(), "lines.avro");
JobConf job = new JobConf();
Path outputPath = new Path(outputPathStr);
outputPath.getFileSystem(job).delete(outputPath);
FileOutputFormat.setOutputPath(job, outputPath);
TetherJob.setExecutable(job, exec, execargs, false);
Schema outscheme = new Pair<Utf8, Long>(new Utf8(""), 0L).getSchema();
AvroJob.setInputSchema(job, Schema.create(Schema.Type.STRING));
job.set(AvroJob.OUTPUT_SCHEMA, outscheme.toString());

int numWords = 0;
for (Pair<Utf8, Long> wc : counts) {
  assertEquals(wc.key().toString(),
      WordCountUtil.COUNTS.get(wc.key().toString()), wc.value());
  numWords++;
  // (snippet truncated)
@Test @SuppressWarnings("deprecation") public void testJob() throws Exception { JobConf job = new JobConf(); String dir = "target/testReflectJob"; Path inputPath = new Path(dir + "/in"); Path outputPath = new Path(dir + "/out"); outputPath.getFileSystem(job).delete(outputPath); inputPath.getFileSystem(job).delete(inputPath); writeLinesFile(new File(dir+"/in")); job.setJobName("reflect"); AvroJob.setInputSchema(job, ReflectData.get().getSchema(Text.class)); AvroJob.setMapOutputSchema (job, new Pair(new Text(""), new Count(0L)).getSchema()); AvroJob.setOutputSchema(job, ReflectData.get().getSchema(WordCount.class)); AvroJob.setMapperClass(job, MapImpl.class); //AvroJob.setCombinerClass(job, ReduceImpl.class); AvroJob.setReducerClass(job, ReduceImpl.class); FileInputFormat.setInputPaths(job, inputPath); FileOutputFormat.setOutputPath(job, outputPath); AvroJob.setReflect(job); // use reflection JobClient.runJob(job); validateCountsFile(new File(new File(dir, "out"), "part-00000.avro")); }
public static void runJob(String mysqlJar, String output) throws Exception {
  Configuration conf = new Configuration();
  JobHelper.addJarForJob(conf, mysqlJar);
  DBConfiguration.configureDB(conf, "com.mysql.jdbc.Driver",
      "jdbc:mysql://localhost/sqoop_test"
      + "?user=hip_sqoop_user&password=password");
  JobConf job = new JobConf(conf);
  job.setJarByClass(DBExportMapReduce.class);
  Path outputPath = new Path(output);
  outputPath.getFileSystem(job).delete(outputPath, true);
  job.set(AvroJob.INPUT_SCHEMA, Stock.SCHEMA$.toString());
  // read Avro files as input; the original snippet also called
  // job.setInputFormat(DBInputFormat.class) immediately after this, which
  // would have silently overridden it, so that line is dropped here
  job.setInputFormat(AvroInputFormat.class);
  job.setOutputFormat(DBOutputFormat.class);
  AvroJob.setOutputSchema(job, Stock.SCHEMA$);
  job.setMapperClass(Map.class);
  job.setMapOutputKeyClass(StockRecord.class);
  job.setMapOutputValueClass(NullWritable.class);
  job.setOutputKeyClass(StockRecord.class);
  job.setOutputValueClass(NullWritable.class);
  job.setNumReduceTasks(4);
  DBOutputFormat.setOutput(job, "stocks_export", StockRecord.fields);
  JobClient.runJob(job);
}
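The job references a Map class that bridges Avro input to the JDBC-writable StockRecord. A plausible sketch follows (an assumption, not the book's actual source; the StockRecord copy constructor in particular is hypothetical). AvroInputFormat supplies AvroWrapper<Stock> keys with NullWritable values:

  // imports assumed: org.apache.avro.mapred.AvroWrapper, org.apache.hadoop.io.NullWritable,
  // org.apache.hadoop.mapred.{MapReduceBase, Mapper, OutputCollector, Reporter}, java.io.IOException
  public static class Map extends MapReduceBase
      implements Mapper<AvroWrapper<Stock>, NullWritable, StockRecord, NullWritable> {
    @Override
    public void map(AvroWrapper<Stock> key, NullWritable value,
                    OutputCollector<StockRecord, NullWritable> output,
                    Reporter reporter) throws IOException {
      // hypothetical: copy the Avro record's fields into the DBWritable record
      output.collect(new StockRecord(key.datum()), NullWritable.get());
    }
  }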