// Excerpt: the input DataSet is built either via the HadoopInputs helper or the
// readHadoopFile(...) shortcut; the surrounding if-condition is not part of this excerpt,
// so the flag below is only a stand-in for the elided condition.
DataSet<Tuple2<LongWritable, Text>> input;
if (useHadoopInputsHelper) { // hypothetical flag
    input = env.createInput(HadoopInputs.readHadoopFile(
            new TextInputFormat(), LongWritable.class, Text.class, textPath));
} else {
    input = env.createInput(readHadoopFile(
            new TextInputFormat(), LongWritable.class, Text.class, textPath));
}

Job job = Job.getInstance();
HadoopOutputFormat<Text, LongWritable> hadoopOutputFormat =
        new HadoopOutputFormat<Text, LongWritable>(new TextOutputFormat<Text, LongWritable>(), job);
job.getConfiguration().set("mapred.textoutputformat.separator", " ");
TextOutputFormat.setOutputPath(job, new Path(resultPath));
@Override
public List<InputSplit> getSplits(JobContext ctx) throws IOException {
    List<InputSplit> res = super.getSplits(ctx);
    splitsCount.set(res.size());
    X.println("___ split of input: " + splitsCount.get());
    return res;
}
void runMRCreateFail(String dbName, String tableName,
        Map<String, String> partitionValues, List<HCatFieldSchema> columns) throws Exception {
    Job job = new Job(mrConf, "hcat mapreduce write fail test");
    job.setJarByClass(this.getClass());
    job.setMapperClass(TestHCatPartitionPublish.MapFail.class);

    // input/output settings
    job.setInputFormatClass(TextInputFormat.class);
    Path path = new Path(fs.getWorkingDirectory(), "mapred/testHCatMapReduceInput");
    // The write count does not matter, as the map will fail in its first call.
    createInputFile(path, 5);
    TextInputFormat.setInputPaths(job, path);

    job.setOutputFormatClass(HCatOutputFormat.class);
    OutputJobInfo outputJobInfo = OutputJobInfo.create(dbName, tableName, partitionValues);
    HCatOutputFormat.setOutput(job, outputJobInfo);

    job.setMapOutputKeyClass(BytesWritable.class);
    job.setMapOutputValueClass(DefaultHCatRecord.class);
    job.setNumReduceTasks(0);
    HCatOutputFormat.setSchema(job, new HCatSchema(columns));

    boolean success = job.waitForCompletion(true);
    Assert.assertFalse(success);
}
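The MapFail mapper referenced above is not shown in this snippet. A minimal sketch of a mapper that forces the job to fail on its first record, assuming the map output types configured above, could look like this (illustrative only, not the test's actual implementation):

// Hypothetical failing mapper: throws as soon as the first record arrives, so
// waitForCompletion(true) returns false regardless of how many input lines were written.
public static class MapFail extends Mapper<LongWritable, Text, BytesWritable, DefaultHCatRecord> {
    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        throw new IOException("Deliberate failure to exercise the abort/cleanup path");
    }
}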
public static void main(String[] args) throws Exception {
    if (args.length < 2) {
        System.err.println("Usage: WordCount <input path> <result path>");
        return;
    }
    final String inputPath = args[0];
    final String outputPath = args[1];

    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

    // Set up the Hadoop Input Format
    Job job = Job.getInstance();
    HadoopInputFormat<LongWritable, Text> hadoopInputFormat =
            new HadoopInputFormat<LongWritable, Text>(new TextInputFormat(), LongWritable.class, Text.class, job);
    TextInputFormat.addInputPath(job, new Path(inputPath));

    // Create a Flink job with it
    DataSet<Tuple2<LongWritable, Text>> text = env.createInput(hadoopInputFormat);

    // Tokenize the line and convert from Writable "Text" to String for better handling
    DataSet<Tuple2<String, Integer>> words = text.flatMap(new Tokenizer());

    // Sum up the words
    DataSet<Tuple2<String, Integer>> result = words.groupBy(0).aggregate(Aggregations.SUM, 1);

    // Convert String back to Writable "Text" for use with Hadoop Output Format
    DataSet<Tuple2<Text, IntWritable>> hadoopResult = result.map(new HadoopDatatypeMapper());

    // Set up Hadoop Output Format
    HadoopOutputFormat<Text, IntWritable> hadoopOutputFormat =
            new HadoopOutputFormat<Text, IntWritable>(new TextOutputFormat<Text, IntWritable>(), job);
    // Set the separator under both the new and the old property name, since this
    // example may run against either Hadoop configuration.
    hadoopOutputFormat.getConfiguration().set("mapreduce.output.textoutputformat.separator", " ");
    hadoopOutputFormat.getConfiguration().set("mapred.textoutputformat.separator", " ");
    TextOutputFormat.setOutputPath(job, new Path(outputPath));

    // Output & Execute
    hadoopResult.output(hadoopOutputFormat);
    env.execute("Word Count");
}
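The Tokenizer and HadoopDatatypeMapper classes used above are not part of this snippet. A minimal sketch of what they could look like, assuming Flink's FlatMapFunction and MapFunction interfaces, is shown below; the real classes in the example may differ:

// Illustrative sketch only.
public static final class Tokenizer
        implements FlatMapFunction<Tuple2<LongWritable, Text>, Tuple2<String, Integer>> {
    @Override
    public void flatMap(Tuple2<LongWritable, Text> value, Collector<Tuple2<String, Integer>> out) {
        // split the line into words and emit (word, 1) pairs
        for (String token : value.f1.toString().toLowerCase().split("\\W+")) {
            if (token.length() > 0) {
                out.collect(new Tuple2<String, Integer>(token, 1));
            }
        }
    }
}

public static final class HadoopDatatypeMapper
        implements MapFunction<Tuple2<String, Integer>, Tuple2<Text, IntWritable>> {
    @Override
    public Tuple2<Text, IntWritable> map(Tuple2<String, Integer> value) {
        // convert back to Hadoop Writable types before handing off to the output format
        return new Tuple2<Text, IntWritable>(new Text(value.f0), new IntWritable(value.f1));
    }
}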
Configuration conf = new Configuration();
conf.set("hive.metastore.uris", "thrift://no.such.machine:10888");

Job job = new Job(conf, "Write-hcat-seq-table");
job.setJarByClass(TestPassProperties.class);
job.setMapperClass(Map.class);
job.setOutputKeyClass(NullWritable.class);
job.setOutputValueClass(DefaultHCatRecord.class);

job.setInputFormatClass(TextInputFormat.class);
TextInputFormat.setInputPaths(job, INPUT_FILE_NAME);
public int run(String[] args) throws IOException {
    Configuration conf = getConf();
    Job job = new Job(conf);
    job.setJobName("MySQLBulkLoading");
    job.setMapperClass(DelimitedLoadMapper.class);
    job.setJarByClass(DelimitedLoadMapper.class);
    job.setNumReduceTasks(0);

    job.setInputFormatClass(TextInputFormat.class);
    TextInputFormat.addInputPath(job, new Path(args[0]));

    job.setMapOutputKeyClass(NullWritable.class);
    job.setMapOutputValueClass(NullWritable.class);
    job.setOutputFormatClass(DBOutputFormat.class);

    int ret = 0;
    try {
        ret = job.waitForCompletion(true) ? 0 : 1;
    } catch (Exception e) {
        e.printStackTrace();
    }
    return ret;
}
Job countingJob = new Job(conf, "JobChaining-Counting");
countingJob.setJarByClass(BasicJobChaining.class);

countingJob.setMapperClass(UserIdCountMapper.class);
countingJob.setCombinerClass(LongSumReducer.class);
countingJob.setReducerClass(UserIdSumReducer.class);

countingJob.setInputFormatClass(TextInputFormat.class);
TextInputFormat.addInputPath(countingJob, postInput);
countingJob.setOutputFormatClass(TextOutputFormat.class);

// The second (binning) job, created elsewhere in the driver, reads the
// intermediate output of the counting job.
binningJob.setInputFormatClass(TextInputFormat.class);
TextInputFormat.addInputPath(binningJob, outputDirIntermediate);
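Since binningJob is only referenced here, a minimal sketch of how the two jobs could be chained in the driver follows; the job name, mapper class, and completion handling are assumptions, not the original driver's code:

// Sketch of the chaining step; outputDirIntermediate is assumed to be the counting job's output.
TextOutputFormat.setOutputPath(countingJob, outputDirIntermediate);

if (countingJob.waitForCompletion(true)) {
    // only start the second job once the first has produced its intermediate output
    Job binningJob = new Job(new Configuration(), "JobChaining-Binning");
    binningJob.setJarByClass(BasicJobChaining.class);
    binningJob.setMapperClass(UserIdBinningMapper.class); // hypothetical mapper name
    binningJob.setNumReduceTasks(0);

    binningJob.setInputFormatClass(TextInputFormat.class);
    TextInputFormat.addInputPath(binningJob, outputDirIntermediate);
    binningJob.waitForCompletion(true);
}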
@Override
protected void configureJob(Job job) throws IOException {
    Configuration conf = job.getConfiguration();

    job.setJarByClass(PartialBuilder.class);

    FileInputFormat.setInputPaths(job, getDataPath());
    FileOutputFormat.setOutputPath(job, getOutputPath(conf));

    job.setOutputKeyClass(TreeID.class);
    job.setOutputValueClass(MapredOutput.class);

    job.setMapperClass(Step1Mapper.class);
    job.setNumReduceTasks(0); // no reducers

    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    // For this implementation to work, mapred.map.tasks needs to be set to the actual
    // number of mappers Hadoop will use:
    TextInputFormat inputFormat = new TextInputFormat();
    List<?> splits = inputFormat.getSplits(job);
    if (splits == null || splits.isEmpty()) {
        log.warn("Unable to compute number of splits?");
    } else {
        int numSplits = splits.size();
        log.info("Setting mapred.map.tasks = {}", numSplits);
        conf.setInt("mapred.map.tasks", numSplits);
    }
}
final Job job = new Job(getConf(), "Write HDFS Index to Accumulo");
job.setJarByClass(this.getClass());

final Configuration jobConf = job.getConfiguration();
jobConf.setBoolean("mapred.map.tasks.speculative.execution", false);
setVarOrders(sparql, jobConf);

TextInputFormat.setInputPaths(job, inputDir);
job.setInputFormatClass(TextInputFormat.class);
JobConf conf = new JobConf();
conf.set("mapreduce.framework.name", "local");
Job job = new Job(conf);

TextInputFormat.setInputPaths(job, new Path(in.getPath()));
TextOutputFormat.setOutputPath(job, new Path(out.getPath()));

job.setMapperClass(Map.class);
job.setReducerClass(Reduce.class);
job.setInputFormatClass(TextInputFormat.class);
job.setMapOutputKeyClass(Text.class);
conf.set(LilyJythonMapper.BULK_MODE, Boolean.toString(bulkMode));

Job job = new Job(conf);
job.setJarByClass(BulkImportTool.class);
job.setMapperClass(LilyJythonMapper.class);
job.setInputFormatClass(TextInputFormat.class);
job.setOutputFormatClass(HFileOutputFormat.class);
job.setOutputValueClass(KeyValue.class);
job.setJobName(formatJobName());

TextInputFormat.addInputPath(job, new Path(inputPath));
HFileOutputFormat.setOutputPath(job, tmpDir);
conf.set(HFILE_PATH, tmpDir.toUri().toString());
public static void main(String[] args) throws Exception {
    Path outDir = new Path("output");
    Configuration conf = new Configuration();
    Job job = Job.getInstance(conf, "user name check");

    job.setJarByClass(UserNamePermission.class);
    job.setMapperClass(UserNamePermission.UserNameMapper.class);
    job.setCombinerClass(UserNamePermission.UserNameReducer.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);
    job.setReducerClass(UserNamePermission.UserNameReducer.class);
    job.setNumReduceTasks(1);

    job.setInputFormatClass(TextInputFormat.class);
    TextInputFormat.addInputPath(job, new Path("input"));
    FileOutputFormat.setOutputPath(job, outDir);

    System.exit(job.waitForCompletion(true) ? 0 : 1);
}
private int createParitionFile(String sequenceFileInput, String outputPath, float frequency)
        throws IOException, ClassNotFoundException, InterruptedException {
    Configuration config = getConf();
    config.setFloat(SAMPLE_FREQUENCY, frequency);

    Job sampler = new Job(config);
    sampler.setInputFormatClass(TextInputFormat.class);
    sampler.setOutputFormatClass(TextOutputFormat.class);
    sampler.setOutputKeyClass(Text.class);
    sampler.setOutputValueClass(NullWritable.class);
    sampler.setNumReduceTasks(0);
    sampler.setMapperClass(Map.class);

    TextInputFormat.addInputPath(sampler, new Path(sequenceFileInput));
    TextOutputFormat.setOutputPath(sampler, new Path(outputPath));

    sampler.waitForCompletion(true);
    return 0;
}
public int run(String[] args) throws Exception {
    Configuration conf = getConf();
    populateConfiguration(args, conf);
    try {
        checkMandatoryConfs(conf);
    } catch (HIHOException e1) {
        e1.printStackTrace();
        throw new Exception(e1);
    }

    Job job = new Job(conf);
    job.getConfiguration().setInt(MRJobConfig.NUM_MAPS, conf.getInt(HIHOConf.NUMBER_MAPPERS, 1));
    job.setJobName("HihoDBExport");
    job.setMapperClass(GenericDBLoadDataMapper.class);
    job.setJarByClass(ExportToDB.class);
    job.setNumReduceTasks(0);

    job.setInputFormatClass(TextInputFormat.class);
    TextInputFormat.addInputPath(job, new Path(inputPath));
    GenericDBOutputFormat.setOutput(job, tableName, columnNames);

    int ret = 0;
    try {
        ret = job.waitForCompletion(true) ? 0 : 1;
    } catch (Exception e) {
        e.printStackTrace();
    }
    return ret;
}
@Test
public void testListStatusSimple() throws IOException {
    Configuration conf = new Configuration();
    conf.setInt(FileInputFormat.LIST_STATUS_NUM_THREADS, numThreads);

    List<Path> expectedPaths = configureTestSimple(conf, localFs);

    Job job = Job.getInstance(conf);
    FileInputFormat<?, ?> fif = new TextInputFormat();
    List<FileStatus> statuses = fif.listStatus(job);

    verifyFileStatuses(expectedPaths, statuses, localFs);
}
private void initialiseInput(final Job job, final MapReduce operation) throws IOException {
    job.setInputFormatClass(TextInputFormat.class);
    for (final Map.Entry<String, String> entry : operation.getInputMapperPairs().entrySet()) {
        if (entry.getValue().contains(job.getConfiguration().get(MAPPER_GENERATOR))) {
            TextInputFormat.addInputPath(job, new Path(entry.getKey()));
        }
    }
}
private int createParitionFile(String inputPath, String outputFile, float frequency, int samplesCnt)
        throws IOException, ClassNotFoundException, InterruptedException {
    Path input = new Path(inputPath);

    Job sampler = new Job(getConf());
    TextInputFormat.addInputPath(sampler, input);

    InputSampler.Sampler<LongWritable, Text> inputSampler =
            new InputSampler.RandomSampler<LongWritable, Text>(frequency, samplesCnt);
    Path partitionFile = new Path(outputFile);
    TotalOrderPartitioner.setPartitionFile(sampler.getConfiguration(), partitionFile);
    InputSampler.writePartitionFile(sampler, inputSampler);
    return 0;
}
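The partition file written above only takes effect once a follow-up job is configured to use it. A minimal sketch of that wiring, with assumed job and variable names (sortJob, numPartitions), is:

// Sketch only: a later job that consumes the partition file written above.
Job sortJob = new Job(getConf(), "total-order-sort");
sortJob.setInputFormatClass(TextInputFormat.class);
TextInputFormat.addInputPath(sortJob, new Path(inputPath));

sortJob.setPartitionerClass(TotalOrderPartitioner.class);
TotalOrderPartitioner.setPartitionFile(sortJob.getConfiguration(), new Path(outputFile));

// one reducer per key range defined by the sampled partition boundaries;
// the map output key type must match the key type that was sampled
sortJob.setNumReduceTasks(numPartitions); // numPartitions is an assumed value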
public static Path compressAndIndex(Path file, Configuration conf) throws IOException {
    Configuration tmpConfig = new Configuration(conf);
    tmpConfig.setLong("dfs.block.size", 512);
    tmpConfig.setInt(LzoCodec.LZO_BUFFER_SIZE_KEY, 512);

    Path compressedFile = LzopFileReadWrite.compress(file, tmpConfig);
    compressedFile.getFileSystem(tmpConfig)
            .delete(new Path(compressedFile.toString() + LzoIndex.LZO_INDEX_SUFFIX), false);
    new LzoIndexer(tmpConfig).index(compressedFile);

    LzoIndex index = LzoIndex.readIndex(compressedFile.getFileSystem(tmpConfig), compressedFile);
    for (int i = 0; i < index.getNumberOfBlocks(); i++) {
        System.out.println("block[" + i + "] = " + index.getPosition(i));
    }

    Job job = new Job(conf);
    job.setInputFormatClass(LzoTextInputFormat.class);
    LzoTextInputFormat inputFormat = new LzoTextInputFormat();
    TextInputFormat.setInputPaths(job, compressedFile);

    List<InputSplit> is = inputFormat.getSplits(job);
    System.out.println("input splits = " + is.size());
    return compressedFile;
}
@Test
public void testNumInputFiles() throws Exception {
    Configuration conf = spy(new Configuration());
    Job job = make(stub(Job.class).returning(conf).from.getConfiguration());
    FileStatus stat = make(stub(FileStatus.class).returning(0L).from.getLen());

    TextInputFormat ispy = spy(new TextInputFormat());
    doReturn(Arrays.asList(stat)).when(ispy).listStatus(job);

    ispy.getSplits(job);
    verify(conf).setLong(FileInputFormat.NUM_INPUT_FILES, 1);
}
@Test
public void testSplitLocationInfo() throws Exception {
    Configuration conf = getConfiguration();
    conf.set(org.apache.hadoop.mapreduce.lib.input.FileInputFormat.INPUT_DIR, "test:///a1/a2");

    Job job = Job.getInstance(conf);
    TextInputFormat fileInputFormat = new TextInputFormat();
    List<InputSplit> splits = fileInputFormat.getSplits(job);
    String[] locations = splits.get(0).getLocations();
    Assert.assertEquals(2, locations.length);

    SplitLocationInfo[] locationInfo = splits.get(0).getLocationInfo();
    Assert.assertEquals(2, locationInfo.length);
    SplitLocationInfo localhostInfo =
            locations[0].equals("localhost") ? locationInfo[0] : locationInfo[1];
    SplitLocationInfo otherhostInfo =
            locations[0].equals("otherhost") ? locationInfo[0] : locationInfo[1];

    Assert.assertTrue(localhostInfo.isOnDisk());
    Assert.assertTrue(localhostInfo.isInMemory());
    Assert.assertTrue(otherhostInfo.isOnDisk());
    Assert.assertFalse(otherhostInfo.isInMemory());
}