@Test
public void testAvroSpecificOutput() throws Exception {
  Job job = new Job();
  FileInputFormat.setInputPaths(job, new Path(getClass()
      .getResource("/org/apache/avro/mapreduce/mapreduce-test-input.txt")
      .toURI().toString()));
  job.setInputFormatClass(TextInputFormat.class);
  job.setMapperClass(LineCountMapper.class);
  job.setMapOutputKeyClass(Text.class);
  job.setMapOutputValueClass(IntWritable.class);
  job.setOutputFormatClass(AvroKeyOutputFormat.class);

  Path outputPath = new Path(DIR.getRoot().getPath() + "/testAvroSpecificOutput");
  outputPath.getFileSystem(job.getConfiguration()).delete(outputPath);
  FileOutputFormat.setOutputPath(job, outputPath);

  Assert.assertTrue(job.waitForCompletion(true));

  FileSystem fileSystem = FileSystem.get(job.getConfiguration());
  FileStatus[] outputFiles = fileSystem.globStatus(outputPath.suffix("/myavro3-*"));
  Assert.assertEquals(1, outputFiles.length);
  Map<String, Integer> counts = new HashMap<>();
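The test above deletes the output directory before submitting the job. This is the usual workaround for FileOutputFormat's output check: if the configured output directory already exists, checkOutputSpecs fails and the job never starts. A minimal sketch of that pattern, using a hypothetical path and the non-deprecated two-argument delete:

// Hypothetical output location; FileOutputFormat requires that it not exist yet.
Path outputPath = new Path("/tmp/example-output");
FileSystem fs = outputPath.getFileSystem(job.getConfiguration());
if (fs.exists(outputPath)) {
  fs.delete(outputPath, true); // true = delete recursively
}
FileOutputFormat.setOutputPath(job, outputPath);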
LOGGER.info("Starting {}", getClass().getSimpleName()); if (fs.exists(outputDir)) { LOGGER.warn("Found the output folder {}, deleting it", _outputDir); fs.delete(outputDir, true); fs.mkdirs(outputDir); Job job = Job.getInstance(getConf()); job.setJarByClass(SegmentCreationJob.class); job.setJobName(_jobName); job.getConfiguration().set("mapreduce.job.credentials.binary", System.getenv("HADOOP_TOKEN_FILE_LOCATION")); job.setMapOutputValueClass(Text.class); FileInputFormat.addInputPath(job, new Path(_stagingDir + "/input/")); FileOutputFormat.setOutputPath(job, new Path(_stagingDir + "/output/")); job.getConfiguration().setInt(JobContext.NUM_MAPS, inputDataFiles.size()); if (_dataSchema != null) { job.getConfiguration().set(JobConfigConstants.SCHEMA, _dataSchema.toString()); setOutputPath(job.getConfiguration());
throw new IOException("Druid broker address not specified in configuration"); String druidQuery = StringEscapeUtils.unescapeJava(conf.get(Constants.DRUID_QUERY_JSON)); LOG.warn("Druid query is empty; creating Select query"); String dataSource = conf.get(Constants.DRUID_DATA_SOURCE); if (dataSource == null || dataSource.isEmpty()) { throw new IOException("Druid data source cannot be empty or null"); conf.set(Constants.DRUID_QUERY_TYPE, druidQueryType); } else { druidQueryType = conf.get(Constants.DRUID_QUERY_TYPE); Job job = Job.getInstance(conf); JobContext jobContext = ShimLoader.getHadoopShims().newJobContext(job); Path[] paths = FileInputFormat.getInputPaths(jobContext);
/**
 * Configures the Hadoop MapReduce job.
 *
 * @return Instance of the Hadoop MapReduce job.
 * @throws IOException If failed.
 */
@SuppressWarnings("deprecation")
private Job createConfigBasedHadoopJob() throws IOException {
    Job jobCfg = new Job();
    Configuration cfg = jobCfg.getConfiguration();

    // Use explicit configuration of distributed file system, if provided.
    cfg.addResource(U.resolveIgniteUrl(DFS_CFG));

    jobCfg.setJobName("HadoopPopularWordExample");
    jobCfg.setJarByClass(HadoopPopularWords.class);
    jobCfg.setInputFormatClass(TextInputFormat.class);
    jobCfg.setOutputKeyClass(Text.class);
    jobCfg.setOutputValueClass(IntWritable.class);
    jobCfg.setMapperClass(TokenizingMapper.class);
    jobCfg.setReducerClass(TopNWordsReducer.class);

    FileInputFormat.setInputPaths(jobCfg, BOOKS_DFS_DIR);
    FileOutputFormat.setOutputPath(jobCfg, RESULT_DFS_DIR);

    // The local job tracker allows only one task per wave, but the text input format
    // replaces that with a value calculated from the input split size option.
    if ("local".equals(cfg.get("mapred.job.tracker", "local"))) {
        // Split job into tasks using 32MB split size.
        FileInputFormat.setMinInputSplitSize(jobCfg, 32L * 1024 * 1024);
        FileInputFormat.setMaxInputSplitSize(jobCfg, Long.MAX_VALUE);
    }

    return jobCfg;
}
@Override
public List<WorkUnit> getWorkunits(SourceState state) {
  try {
    Job job = Job.getInstance(new Configuration());
    FileInputFormat.addInputPath(job, new Path(inputPath));
    FileInputFormat<K, V> fileInputFormat = getFileInputFormat(state, job.getConfiguration());
    List<InputSplit> fileSplits = fileInputFormat.getSplits(job);
    if (fileSplits == null || fileSplits.isEmpty()) {
      return ImmutableList.of();
    }

    WorkUnit workUnit = WorkUnit.create(extract);
    workUnit.setProp(FILE_SPLIT_BYTES_STRING_KEY, HadoopUtils.serializeToString(fileSplit));
    workUnit.setProp(FILE_SPLIT_PATH_KEY, fileSplit.getPath().toString());
    workUnits.add(workUnit);
/**
 * Prepare job with mappers to cancel.
 *
 * @return Configuration of the fully configured job.
 * @throws Exception If fails.
 */
private Configuration prepareJobForCancelling() throws Exception {
    prepareFile("/testFile", 1500);

    executedTasks.set(0);
    cancelledTasks.set(0);
    failMapperId.set(0);
    splitsCount.set(0);

    Configuration cfg = new Configuration();
    setupFileSystems(cfg);

    Job job = Job.getInstance(cfg);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);
    job.setMapperClass(CancellingTestMapper.class);
    job.setNumReduceTasks(0);
    job.setInputFormatClass(InFormat.class);

    FileInputFormat.setInputPaths(job, new Path("igfs://" + igfsName + "@/"));
    FileOutputFormat.setOutputPath(job, new Path("igfs://" + igfsName + "@/output/"));

    job.setJarByClass(getClass());

    return job.getConfiguration();
}
    throws IOException {
  TableName tableName = TableName.valueOf(args[0]);
  conf.set(TABLE_NAME, tableName.getNameAsString());
  Path inputDir = new Path(args[1]);
  Job job = Job.getInstance(conf, conf.get(JOB_NAME_CONF_KEY, NAME + "_" + tableName));
  job.setJarByClass(Importer.class);
  FileInputFormat.setInputPaths(job, inputDir);
  job.setInputFormatClass(SequenceFileInputFormat.class);
  String hfileOutPath = conf.get(BULK_OUTPUT_CONF_KEY);

  // Bulk-output variant: map output keyed by CellWritableComparable.
  job.setMapperClass(CellSortImporter.class);
  job.setReducerClass(CellReducer.class);
  Path outputDir = new Path(hfileOutPath);
  FileOutputFormat.setOutputPath(job, outputDir);
  job.setMapOutputKeyClass(CellWritableComparable.class);
  job.setMapOutputValueClass(MapReduceExtendedCell.class);

  // Alternative bulk-output variant: map output keyed by row (ImmutableBytesWritable).
  job.setReducerClass(CellSortReducer.class);
  Path outputDir = new Path(hfileOutPath);
  FileOutputFormat.setOutputPath(job, outputDir);
  job.setMapOutputKeyClass(ImmutableBytesWritable.class);
  job.setMapOutputValueClass(MapReduceExtendedCell.class);
@Test conf.set("hive.io.file.read.all.columns", "false"); conf.set("hive.io.file.readcolumn.ids", "1,3"); Job job = new Job(conf, "orc test"); job.setInputFormatClass(OrcNewInputFormat.class); job.setJarByClass(TestNewInputOutputFormat.class); job.setMapperClass(OrcTestMapper1.class); job.setNumReduceTasks(0); job.setOutputKeyClass(Text.class); job.setOutputValueClass(IntWritable.class); FileInputFormat.addInputPath(job, new Path(HiveTestUtils .getFileFromClasspath("orc-file-11-format.orc"))); Path outputPath = new Path(workDir, "TestOrcFile." + testCaseName.getMethodName() + ".txt"); localFs.delete(outputPath, true); FileOutputFormat.setOutputPath(job, outputPath); boolean result = job.waitForCompletion(true); assertTrue(result); Path outputFilePath = new Path(outputPath, "part-m-00000"); new InputStreamReader(localFs.open(outputFilePath))); String line=reader.readLine(); "null, null, null, null, null, null, null}"); localFs.delete(outputPath, true);
@Test conf.set("hive.exec.orc.default.compress", "SNAPPY"); Job job = new Job(conf, "orc test"); job.setOutputFormatClass(OrcNewOutputFormat.class); job.setJarByClass(TestNewInputOutputFormat.class); job.setMapperClass(OrcTestMapper2.class); job.setNumReduceTasks(0); job.setOutputKeyClass(NullWritable.class); job.setOutputValueClass(OrcSerdeRow.class); FileInputFormat.addInputPath(job, inputPath); FileOutputFormat.setOutputPath(job, outputPath); boolean result = job.waitForCompletion(true); assertTrue(result); Path outputFilePath = new Path(outputPath, "part-m-00000"); Reader reader = OrcFile.createReader(outputFilePath, OrcFile.readerOptions(conf).filesystem(localFs)); assertEquals(reader.getCompression(), CompressionKind.SNAPPY); localFs.delete(outputPath, true);
job = Job.getInstance(getConf(), getOptionValue(OPTION_JOB_NAME));
logger.info("Starting: " + job.getJobName());

attachSegmentMetadataWithAll(cubeSeg, job.getConfiguration());
job.getConfiguration().set(BatchConstants.CFG_CUBE_NAME, cubeName);
job.getConfiguration().set(BatchConstants.CFG_CUBE_SEGMENT_ID, segmentID);
job.getConfiguration().set(BatchConstants.CFG_CUBOID_MODE, cuboidModeName);
job.getConfiguration().set(BatchConstants.CFG_UPDATE_SHARD, ifNeedUpdateBaseCuboidShard);

FileInputFormat.setInputPaths(job, new Path(input));
job.setInputFormatClass(SequenceFileInputFormat.class);
job.setOutputValueClass(Text.class);

Path outputPath = new Path(output);
FileOutputFormat.setOutputPath(job, outputPath);
HadoopUtil.deletePath(job.getConfiguration(), outputPath);
/**
 * @throws Exception If failed.
 */
@Test
public void testSimpleTaskSubmit() throws Exception {
    String testInputFile = "/test";

    prepareTestFile(testInputFile);

    Configuration cfg = new Configuration();
    setupFileSystems(cfg);

    Job job = Job.getInstance(cfg);
    job.setMapperClass(TestMapper.class);
    job.setCombinerClass(TestReducer.class);
    job.setReducerClass(TestReducer.class);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(IntWritable.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);

    job.setNumReduceTasks(1);

    FileInputFormat.setInputPaths(job, new Path("igfs://:" + getTestIgniteInstanceName(0) + "@/" + testInputFile));
    FileOutputFormat.setOutputPath(job, new Path("igfs://:" + getTestIgniteInstanceName(0) + "@/output"));

    job.setJarByClass(getClass());

    IgniteInternalFuture<?> fut = grid(0).hadoop().submit(new HadoopJobId(UUID.randomUUID(), 1),
        createJobInfo(job.getConfiguration(), null));

    fut.get();
}
groupByJob = Job.getInstance(
    new Configuration(),
    StringUtils.format("%s-determine_partitions_groupby-%s", config.getDataSource(), config.getIntervals())
);
config.addJobProperties(groupByJob);

FileOutputFormat.setOutputPath(groupByJob, config.makeGroupedDataDir());
groupByJob.submit();

dimSelectionJob.getConfiguration().set("io.sort.record.percent", "0.19");
dimSelectionJob.setMapperClass(DeterminePartitionsDimSelectionPostGroupByMapper.class);
dimSelectionJob.setInputFormatClass(SequenceFileInputFormat.class);
FileInputFormat.addInputPath(dimSelectionJob, config.makeGroupedDataDir());
} else {

FileOutputFormat.setOutputPath(dimSelectionJob, config.makeIntermediatePath());
dimSelectionJob.submit();

final Path partitionInfoPath = config.makeSegmentPartitionInfoPath(segmentGranularity);
if (fileSystem == null) {
  fileSystem = partitionInfoPath.getFileSystem(dimSelectionJob.getConfiguration());
}
parseOptions(options, args);

job = Job.getInstance(getConf(), getOptionValue(OPTION_JOB_NAME));
String cubeName = getOptionValue(OPTION_CUBE_NAME).toUpperCase(Locale.ROOT);
String segmentID = getOptionValue(OPTION_SEGMENT_ID);
Path input = new Path(getOptionValue(OPTION_INPUT_PATH));
Path output = new Path(getOptionValue(OPTION_OUTPUT_PATH));

CubeSegment originalSegment = cube.getOriginalSegmentToOptimize(optSegment);

logger.info("Starting: " + job.getJobName());

job.setMapperClass(UpdateOldCuboidShardMapper.class);
job.setInputFormatClass(SequenceFileInputFormat.class);
FileInputFormat.setInputPaths(job, input);
FileOutputFormat.setOutputPath(job, output);

job.getConfiguration().set(BatchConstants.CFG_CUBE_NAME, cubeName);
job.getConfiguration().set(BatchConstants.CFG_CUBE_SEGMENT_ID, segmentID);
attachSegmentsMetadataWithDict(Lists.newArrayList(optSegment, originalSegment), job.getConfiguration());
@Override
public List<WorkUnit> getWorkunits(SourceState state) {
  List<String> dirs = Splitter.on(",").splitToList(state.getProp(INPUT_DIRECTORIES_KEY));
  String outputBase = state.getProp(OUTPUT_LOCATION);

  List<WorkUnit> workUnits = Lists.newArrayList();
  for (String dir : dirs) {
    try {
      Path input = new Path(dir);
      Path output = new Path(outputBase, input.getName());

      WorkUnit workUnit = new WorkUnit();
      TaskUtils.setTaskFactoryClass(workUnit, MRTaskFactory.class);

      Configuration conf = new Configuration();
      Job job = Job.getInstance(conf, "WordCount_" + input.getName());
      job.setJarByClass(MRTaskFactoryTest.class);
      job.setMapperClass(TokenizerMapper.class);
      job.setCombinerClass(IntSumReducer.class);
      job.setReducerClass(IntSumReducer.class);
      job.setOutputKeyClass(Text.class);
      job.setOutputValueClass(IntWritable.class);
      job.setNumReduceTasks(1);
      FileInputFormat.addInputPath(job, input);
      FileOutputFormat.setOutputPath(job, output);

      MRTask.serializeJobToState(workUnit, job);
      workUnits.add(workUnit);
    } catch (IOException ioe) {
      log.error("Failed to create MR job for " + dir, ioe);
    }
  }
  return workUnits;
}
@Test
public void testGetSplits() throws Exception {
  URI baseUri = new URI(GobblinWorkUnitsInputFormatTest.class.getSimpleName() + "://testGetSplits");
  Configuration configuration = new Configuration();
  Path workUnitsDir = new Path(new Path(baseUri), "/workUnits");

  FileSystem fs = Mockito.mock(FileSystem.class);
  FileStatus[] statuses = createFileStatuses(20, workUnitsDir);
  Mockito.when(fs.listStatus(workUnitsDir)).thenReturn(statuses);
  Mockito.when(fs.makeQualified(Mockito.any(Path.class))).thenAnswer(new Answer<Path>() {
    @Override
    public Path answer(InvocationOnMock invocation) throws Throwable {
      return (Path) invocation.getArguments()[0];
    }
  });
  FileSystemTestUtils.addFileSystemForTest(baseUri, configuration, fs);

  GobblinWorkUnitsInputFormat inputFormat = new GobblinWorkUnitsInputFormat();
  Job job = Job.getInstance(configuration);
  FileInputFormat.addInputPath(job, workUnitsDir);

  List<InputSplit> splits = inputFormat.getSplits(job);
  Assert.assertEquals(splits.size(), 20);
  verifyPaths(splits, statuses);
}
@Test
public void testAddInputPath() throws IOException {
  final Configuration conf = new Configuration();
  conf.set("fs.defaultFS", "file:///abc/");
  final Job j = Job.getInstance(conf);

  //setup default fs
  final FileSystem defaultfs = FileSystem.get(conf);
  System.out.println("defaultfs.getUri() = " + defaultfs.getUri());

  {
    //test addInputPath
    final Path original = new Path("file:/foo");
    System.out.println("original = " + original);
    FileInputFormat.addInputPath(j, original);

    final Path[] results = FileInputFormat.getInputPaths(j);
    System.out.println("results = " + Arrays.asList(results));
    assertEquals(1, results.length);
    assertEquals(original, results[0]);
  }

  {
    //test setInputPaths
    final Path original = new Path("file:/bar");
    System.out.println("original = " + original);
    FileInputFormat.setInputPaths(j, original);

    final Path[] results = FileInputFormat.getInputPaths(j);
    System.out.println("results = " + Arrays.asList(results));
    assertEquals(1, results.length);
    assertEquals(original, results[0]);
  }
}
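The distinction this test exercises: FileInputFormat.addInputPath appends to the job's list of input paths, while setInputPaths replaces the whole list, which is why the second block still sees exactly one path after the first block added file:/foo. A minimal sketch of the difference, with hypothetical paths:

Job j = Job.getInstance(new Configuration());

FileInputFormat.addInputPath(j, new Path("file:/first"));
FileInputFormat.addInputPath(j, new Path("file:/second"));
// FileInputFormat.getInputPaths(j) now returns both paths, in the order they were added.

FileInputFormat.setInputPaths(j, new Path("file:/only"));
// getInputPaths(j) now returns just file:/only; the earlier list was replaced.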
public void checkOutputFormat() throws Exception {
  Job job = new Job();

  WordCountUtil wordCountUtil = new WordCountUtil("trevniMapReduceKeyTest", "part-r-00000");
  wordCountUtil.writeLinesFile();

  AvroJob.setInputKeySchema(job, STRING);
  AvroJob.setOutputKeySchema(job, Pair.getPairSchema(STRING, LONG));

  job.setMapperClass(WordCountMapper.class);
  job.setReducerClass(WordCountReducer.class);

  job.setMapOutputKeyClass(Text.class);
  job.setMapOutputValueClass(LongWritable.class);

  FileInputFormat.setInputPaths(job, new Path(wordCountUtil.getDir().toString() + "/in"));
  FileOutputFormat.setOutputPath(job, new Path(wordCountUtil.getDir().toString() + "/out"));
  FileOutputFormat.setCompressOutput(job, true);

  job.setInputFormatClass(AvroKeyInputFormat.class);
  job.setOutputFormatClass(AvroTrevniKeyOutputFormat.class);

  job.waitForCompletion(true);

  wordCountUtil.validateCountsFile();
}
parseOptions(options, args);

job = Job.getInstance(getConf(), getOptionValue(OPTION_JOB_NAME));
String job_id = getOptionValue(OPTION_CUBING_JOB_ID);
String cubeName = getOptionValue(OPTION_CUBE_NAME);
Path output = new Path(getOptionValue(OPTION_OUTPUT_PATH));
Path input = new Path(getOptionValue(OPTION_INPUT_PATH));

attachCubeMetadata(cube, job.getConfiguration());

Path path = new Path(input.toString() + "/" + tblColRef.getIdentity());
if (HadoopUtil.getFileSystem(path).exists(path)) {
  FileInputFormat.addInputPath(job, path);
  hasUHCValue = true;
}

setupReducer(output, reducerCount);

job.getConfiguration().set(BatchConstants.CFG_CUBE_NAME, cubeName);
job.getConfiguration().set(BatchConstants.ARG_CUBING_JOB_ID, job_id);
job.getConfiguration().set(BatchConstants.CFG_GLOBAL_DICT_BASE_DIR, KylinConfig.getInstanceFromEnv().getHdfsWorkingDirectory());
job.getConfiguration().set(BatchConstants.CFG_MAPRED_OUTPUT_COMPRESS, "false");
if (inp.endsWith("/*")) { inp = inp.substring(0, inp.length() - 2); FileSystem fs = HadoopUtil.getWorkingFileSystem(job.getConfiguration()); Path path = new Path(inp); logger.warn("Path not exist:" + path.toString()); continue; FileStatus[] fileStatuses = fs.listStatus(path); boolean hasDir = false; for (FileStatus stat : fileStatuses) { if (stat.isDirectory() && !stat.getPath().getName().startsWith("_")) { hasDir = true; ret += addInputDirs(new String[] { stat.getPath().toString() }, job); FileInputFormat.addInputPath(job, new Path(inp)); ret++;
@Test
public void testListStatusErrorOnNonExistantDir() throws IOException {
  Configuration conf = new Configuration();
  conf.setInt(FileInputFormat.LIST_STATUS_NUM_THREADS, numThreads);
  configureTestErrorOnNonExistantDir(conf, localFs);

  Job job = Job.getInstance(conf);
  FileInputFormat<?, ?> fif = new TextInputFormat();
  try {
    fif.listStatus(job);
    Assert.fail("Expecting an IOException for a missing Input path");
  } catch (IOException e) {
    Path expectedExceptionPath = new Path(TEST_ROOT_DIR, "input2");
    expectedExceptionPath = localFs.makeQualified(expectedExceptionPath);
    Assert.assertTrue(e instanceof InvalidInputException);
    Assert.assertEquals("Input path does not exist: " + expectedExceptionPath.toString(),
        e.getMessage());
  }
}
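Across these results the same driver skeleton recurs: create a Job, wire the mapper, reducer, and key/value classes, point FileInputFormat and FileOutputFormat at the input and output paths, then submit and wait. A minimal, self-contained sketch of that pattern; the class name, mapper, reducer, and command-line paths are placeholders, not taken from any of the snippets above:

import java.io.IOException;
import java.util.StringTokenizer;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

public class WordCountDriver {

  /** Splits each input line into tokens and emits (token, 1). */
  public static class TokenizerMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
    private static final IntWritable ONE = new IntWritable(1);
    private final Text word = new Text();

    @Override
    protected void map(LongWritable key, Text value, Context context)
        throws IOException, InterruptedException {
      StringTokenizer tokenizer = new StringTokenizer(value.toString());
      while (tokenizer.hasMoreTokens()) {
        word.set(tokenizer.nextToken());
        context.write(word, ONE);
      }
    }
  }

  /** Sums the counts emitted for each token; also usable as the combiner. */
  public static class IntSumReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context)
        throws IOException, InterruptedException {
      int sum = 0;
      for (IntWritable value : values) {
        sum += value.get();
      }
      context.write(key, new IntWritable(sum));
    }
  }

  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    Job job = Job.getInstance(conf, "word count");
    job.setJarByClass(WordCountDriver.class);

    job.setMapperClass(TokenizerMapper.class);
    job.setCombinerClass(IntSumReducer.class);
    job.setReducerClass(IntSumReducer.class);

    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);

    // Input and output locations come from the command line; the output
    // directory must not already exist when the job is submitted.
    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    System.exit(job.waitForCompletion(true) ? 0 : 1);
  }
}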