/**
 * Configure <code>job</code> with a TotalOrderPartitioner, partitioning against
 * <code>splitPoints</code>. Cleans up the partitions file after the job exits.
 */
static void configurePartitioner(Job job, List<ImmutableBytesWritable> splitPoints,
    boolean writeMultipleTables) throws IOException {
  Configuration conf = job.getConfiguration();
  // create the partitions file
  FileSystem fs = FileSystem.get(conf);
  String hbaseTmpFsDir = conf.get(HConstants.TEMPORARY_FS_DIRECTORY_KEY,
      fs.getHomeDirectory() + "/hbase-staging");
  Path partitionsPath = new Path(hbaseTmpFsDir, "partitions_" + UUID.randomUUID());
  fs.makeQualified(partitionsPath);
  writePartitions(conf, partitionsPath, splitPoints, writeMultipleTables);
  fs.deleteOnExit(partitionsPath);

  // configure job to use it
  job.setPartitionerClass(TotalOrderPartitioner.class);
  TotalOrderPartitioner.setPartitionFile(conf, partitionsPath);
}
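Most of the results below follow the same pattern this method captures: write the split keys into a SequenceFile, then point TotalOrderPartitioner at it and size the reducer count to match. A minimal self-contained sketch of that pattern (Text keys and the /tmp staging location are illustrative assumptions, not taken from any result on this page):

import java.io.IOException;
import java.util.List;
import java.util.UUID;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.partition.TotalOrderPartitioner;

public class TotalOrderSetupSketch {
  /** Writes one split point per record, then wires the partitioner to the file. */
  static void configureTotalOrder(Job job, List<Text> splitPoints) throws IOException {
    Configuration conf = job.getConfiguration();
    // Hypothetical staging location; real jobs use a per-job temp directory.
    Path partitionsPath = new Path("/tmp", "partitions_" + UUID.randomUUID());
    FileSystem fs = partitionsPath.getFileSystem(conf);
    try (SequenceFile.Writer w = SequenceFile.createWriter(conf,
        SequenceFile.Writer.file(fs.makeQualified(partitionsPath)),
        SequenceFile.Writer.keyClass(Text.class),
        SequenceFile.Writer.valueClass(NullWritable.class))) {
      for (Text split : splitPoints) {
        w.append(split, NullWritable.get());
      }
    }
    fs.deleteOnExit(partitionsPath); // best-effort cleanup when the client JVM exits
    job.setMapOutputKeyClass(Text.class); // keys in the file must match this class
    job.setNumReduceTasks(splitPoints.size() + 1); // N split points => N + 1 reducers
    job.setPartitionerClass(TotalOrderPartitioner.class);
    TotalOrderPartitioner.setPartitionFile(conf, partitionsPath);
  }
}

The invariant to keep in mind: with N split points the job needs N + 1 reducers, or TotalOrderPartitioner.setConf throws "Wrong number of partitions in keyset" (see that result further down).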
    throws IOException {
  TableName tableName = TableName.valueOf(args[0]);
  conf.set(TABLE_NAME, tableName.getNameAsString());
  Path inputDir = new Path(args[1]);
  Job job = Job.getInstance(conf, conf.get(JOB_NAME_CONF_KEY, NAME + "_" + tableName));
  job.setJarByClass(Importer.class);
  FileInputFormat.setInputPaths(job, inputDir);
  job.setInputFormatClass(SequenceFileInputFormat.class);
  String hfileOutPath = conf.get(BULK_OUTPUT_CONF_KEY);
  job.setMapperClass(CellSortImporter.class);
  job.setReducerClass(CellReducer.class);
  Path outputDir = new Path(hfileOutPath);
  FileOutputFormat.setOutputPath(job, outputDir);
  job.setMapOutputKeyClass(CellWritableComparable.class);
  job.getConfiguration().setClass("mapreduce.job.output.key.comparator.class",
      CellWritableComparable.CellWritableComparator.class, RawComparator.class);
  Path partitionsPath = new Path(TotalOrderPartitioner.getPartitionFile(job.getConfiguration()));
  FileSystem fs = FileSystem.get(job.getConfiguration());
  fs.deleteOnExit(partitionsPath);
  job.setPartitionerClass(CellWritableComparablePartitioner.class);
  // regionLocator for the target table is obtained earlier (elided in this excerpt)
  job.setNumReduceTasks(regionLocator.getStartKeys().length);
protected Job createJob(String[] args) throws Exception {
  Job job = new Job();
  job.setJobName("ArchiveCDXGenerator" + "_" + System.currentTimeMillis());
  Configuration conf = job.getConfiguration();
  ...
  } else {
    job.setPartitionerClass(TotalOrderPartitioner.class);
    TotalOrderPartitioner.setPartitionFile(job.getConfiguration(),
        new Path(outputPath, "_partitions.lst"));
    FileSystem fs = input.getFileSystem(conf);
    FileStatus inputStatus = fs.getFileStatus(input);
    FileInputFormat.setMaxInputSplitSize(job,
        inputStatus.getLen() / getNumMapTasks(new Path(this.inputPath), conf));
try {
  this.conf = conf;
  String parts = getPartitionFile(conf);
  final Path partFile = new Path(parts);
  final FileSystem fs = (DEFAULT_PATH.equals(parts))
      ? FileSystem.getLocal(conf)   // assume in DistributedCache
      : partFile.getFileSystem(conf);

  Job job = Job.getInstance(conf);
  Class<K> keyClass = (Class<K>) job.getMapOutputKeyClass();
  K[] splitPoints = readPartitions(fs, partFile, keyClass, conf);
  if (splitPoints.length != job.getNumReduceTasks() - 1) {
    throw new IOException("Wrong number of partitions in keyset");
  }
  RawComparator<K> comparator = (RawComparator<K>) job.getSortComparator();
  boolean natOrder = conf.getBoolean(NATURAL_ORDER, true);
  if (natOrder && BinaryComparable.class.isAssignableFrom(keyClass)) {
    partitions = buildTrie((BinaryComparable[]) splitPoints, 0,
        splitPoints.length, new byte[0],
        conf.getInt(MAX_TRIE_DEPTH, 200));
  } else {
    partitions = new BinarySearchNode(splitPoints, comparator);
public Job createSubmittableJob(String[] args) throws IOException {
  Path partitionsPath = new Path(destPath, PARTITIONS_FILE_NAME);
  generatePartitions(partitionsPath);

  Job job = Job.getInstance(getConf(),
      getConf().get("mapreduce.job.name", "hashTable_" + tableHash.tableName));
  Configuration jobConf = job.getConfiguration();
  jobConf.setLong(HASH_BATCH_SIZE_CONF_KEY, tableHash.batchSize);
  job.setJarByClass(HashTable.class);
  TableMapReduceUtil.initTableMapperJob(tableHash.tableName, tableHash.initScan(),
      HashMapper.class, ImmutableBytesWritable.class, ImmutableBytesWritable.class, job);

  // use a TotalOrderPartitioner and reducers to group region output into hash files
  job.setPartitionerClass(TotalOrderPartitioner.class);
  TotalOrderPartitioner.setPartitionFile(jobConf, partitionsPath);
  job.setReducerClass(Reducer.class); // identity reducer
  job.setNumReduceTasks(tableHash.numHashFiles);
  job.setOutputKeyClass(ImmutableBytesWritable.class);
  job.setOutputValueClass(ImmutableBytesWritable.class);
  job.setOutputFormatClass(MapFileOutputFormat.class);
  FileOutputFormat.setOutputPath(job, new Path(destPath, HASH_DATA_DIR));

  return job;
}
    throws IOException, ClassNotFoundException, InterruptedException {
  LinkedList<K> splits = new LinkedList<K>();
  Configuration conf = job.getConfiguration();
  final InputFormat inf =
      ReflectionUtils.newInstance(job.getInputFormatClass(), conf);
  int numPartitions = job.getNumReduceTasks();
  K[] samples = (K[]) sampler.getSample(inf, job);
  LOG.info("Using " + samples.length + " samples");
  // sort the samples with the job's sort comparator, so the split points
  // follow the same order in which reduce input keys will be sorted
  RawComparator<K> comparator = (RawComparator<K>) job.getSortComparator();
  Arrays.sort(samples, comparator);
  Path dst = new Path(TotalOrderPartitioner.getPartitionFile(conf));
  FileSystem fs = dst.getFileSystem(conf);
  if (fs.exists(dst)) {
    fs.delete(dst, false);
  }
  SequenceFile.Writer writer = SequenceFile.createWriter(fs, conf, dst,
      job.getMapOutputKeyClass(), NullWritable.class);
  NullWritable nullValue = NullWritable.get();
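A hedged sketch of how writePartitionFile is typically driven from job setup (the input format, key class, reducer count, and path arguments here are assumptions for illustration, not from the snippet above):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
import org.apache.hadoop.mapreduce.lib.partition.InputSampler;
import org.apache.hadoop.mapreduce.lib.partition.TotalOrderPartitioner;

public class SamplerDriverSketch {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    Job job = Job.getInstance(conf, "total-order-sort");
    job.setInputFormatClass(SequenceFileInputFormat.class);
    SequenceFileInputFormat.addInputPath(job, new Path(args[0]));
    job.setMapOutputKeyClass(Text.class);
    // The sampler writes numReduceTasks - 1 split points, so fix this first.
    job.setNumReduceTasks(8);
    // Register the partition file before sampling; writePartitionFile reads
    // its destination from this configuration setting.
    TotalOrderPartitioner.setPartitionFile(job.getConfiguration(), new Path(args[1]));
    // Sample ~1% of input keys, capped at 10000 samples (illustrative values).
    InputSampler.Sampler<Text, Text> sampler =
        new InputSampler.RandomSampler<>(0.01, 10000);
    InputSampler.writePartitionFile(job, sampler);
    job.setPartitionerClass(TotalOrderPartitioner.class);
  }
}

Note the ordering: the reducer count and the partition file location must both be fixed before sampling, since writePartitionFile derives the number of split points from one and its output destination from the other.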
public static Job createSubmittableJob(Configuration conf, String[] args)
    throws IOException, ClassNotFoundException, InterruptedException, URISyntaxException {
  Path inputDir = new Path(args[0]);
  Path outputDir = new Path(args[1]);
  boolean createPartitionFile = Boolean.parseBoolean(args[2]);

  Job job = Job.getInstance(conf, "Import delicious RSS feed into Hush tables.");
  job.setJarByClass(BulkImportJobExample.class);
  job.setInputFormatClass(TextInputFormat.class);
  job.getConfiguration().set("hfile.compression", "gz");

  Path partitionsPath = new Path(job.getWorkingDirectory(),
      "partitions_" + System.currentTimeMillis());
  TotalOrderPartitioner.setPartitionFile(job.getConfiguration(), partitionsPath);
@SuppressWarnings("unchecked") @Override public int run(String[] args) throws Exception { Configuration conf = new Configuration(); Path inputPath = new Path(args[0]); Path partitionFile = new Path(args[1] + "_partitions.lst"); Path outputStage = new Path(args[1] + "_staging"); Path outputOrder = new Path(args[1]); Job sampleJob = new Job(conf, "TotalOrderSortingStage"); sampleJob.setJarByClass(TotalOrderSortingStage.class); sampleJob.setMapperClass(LastAccessMapper.class); sampleJob.setNumReduceTasks(0); sampleJob.setOutputKeyClass(Text.class); orderJob.setPartitionerClass(TotalOrderPartitioner.class); TotalOrderPartitioner.setPartitionFile(orderJob.getConfiguration(), partitionFile); orderJob.setOutputKeyClass(Text.class); orderJob.getConfiguration().set( "mapred.textoutputformat.separator", ""); FileSystem.get(new Configuration()).delete(partitionFile, false); FileSystem.get(new Configuration()).delete(outputStage, true); return code;
public void testTotalOrderMemCmp() throws Exception {
  TotalOrderPartitioner<Text, NullWritable> partitioner =
      new TotalOrderPartitioner<Text, NullWritable>();
  Configuration conf = new Configuration();
  Path p = TestTotalOrderPartitioner.<Text>writePartitionFile(
      "totalordermemcmp", conf, splitStrings);
  conf.setClass(MRJobConfig.MAP_OUTPUT_KEY_CLASS, Text.class, Object.class);
  try {
    partitioner.setConf(conf);
    NullWritable nw = NullWritable.get();
    for (Check<Text> chk : testStrings) {
      assertEquals(chk.data.toString(), chk.part,
          partitioner.getPartition(chk.data, nw, splitStrings.length + 1));
    }
  } finally {
    p.getFileSystem(conf).delete(p, true);
  }
}
ClusterStatus cluster = client.getClusterStatus();
int num_reduces = (int) (cluster.getMaxReduceTasks() * 0.9);
String sort_reduces = conf.get(REDUCES_PER_HOST);
if (sort_reduces != null) {
  num_reduces = cluster.getTaskTrackers() * Integer.parseInt(sort_reduces);
}
...
job = Job.getInstance(conf);
job.setJobName("sorter");
job.setJarByClass(Sort.class);
job.setMapperClass(Mapper.class);
...
FileOutputFormat.setOutputPath(job, new Path(otherArgs.get(1)));
job.setPartitionerClass(TotalOrderPartitioner.class);
Path inputDir = FileInputFormat.getInputPaths(job)[0];
FileSystem fs = inputDir.getFileSystem(conf);
inputDir = inputDir.makeQualified(fs.getUri(), fs.getWorkingDirectory());
Path partitionFile = new Path(inputDir, "_sortPartitioning");
TotalOrderPartitioner.setPartitionFile(conf, partitionFile);
InputSampler.<K,V>writePartitionFile(job, sampler);
URI partitionUri = new URI(partitionFile.toString() + "#" + "_sortPartitioning");
job.addCacheFile(partitionUri);
/**
 * Configure <code>job</code> with a TotalOrderPartitioner, partitioning against
 * <code>splitPoints</code>. Cleans up the partitions file after the job exits.
 */
static void configurePartitioner(Job job, Set<TableRowkeyPair> tablesStartKeys)
    throws IOException {
  Configuration conf = job.getConfiguration();
  // create the partitions file
  Path partitionsPath = new Path(conf.get("hadoop.tmp.dir"), "partitions_" + UUID.randomUUID());
  FileSystem fs = partitionsPath.getFileSystem(conf);
  fs.makeQualified(partitionsPath);
  writePartitions(conf, partitionsPath, tablesStartKeys);
  fs.deleteOnExit(partitionsPath);

  // configure job to use it
  job.setPartitionerClass(TotalOrderPartitioner.class);
  TotalOrderPartitioner.setPartitionFile(conf, partitionsPath);
}
private int createPartitionFile(String sequenceFileInput, int regionCnt)
    throws IOException, ClassNotFoundException, InterruptedException {
  Path input = new Path(sequenceFileInput);
  Job sampler = new Job(getConf());
  sampler.setNumReduceTasks(regionCnt);
  sampler.setInputFormatClass(SequenceFileInputFormat.class);
  sampler.setOutputFormatClass(SequenceFileOutputFormat.class);
  sampler.setOutputKeyClass(BytesWritable.class);
  SequenceFileInputFormat.addInputPath(sampler, input);

  Configuration config = sampler.getConfiguration();
  InputSampler.Sampler<BytesWritable, NullWritable> inputSampler =
      new InputSampler.RandomSampler<BytesWritable, NullWritable>(
          config.getFloat(SAMPLER_FREQUENCY_KEY, SAMPLER_FREQUENCY_DEFAULT_VALUE),
          config.getInt(SAMPLER_NUM_SAMPLES_KEY, SAMPLER_NUM_SAMPLES_DEFAULT_VALUE));
  Path partitionFile = new Path(
      config.get(OUTPUT_KEYS_FILE_NAME_KEY, OUTPUT_KEYS_FILE_NAME_DEFAULT_VALUE));
  TotalOrderPartitioner.setPartitionFile(config, partitionFile);
  InputSampler.<BytesWritable, NullWritable>writePartitionFile(sampler, inputSampler);
  return 0;
}
Job job = Job.getInstance(getConf());
ArrayList<String> otherArgs = new ArrayList<String>();
Sampler<K,V> sampler = null;
try {
  ...
  if ("-r".equals(args[i])) {
    job.setNumReduceTasks(Integer.parseInt(args[++i]));
  } else if ("-inFormat".equals(args[i])) {
    job.setInputFormatClass(Class.forName(args[++i]).asSubclass(InputFormat.class));
  } else if ("-keyClass".equals(args[i])) {
    ...
Path outf = new Path(otherArgs.remove(otherArgs.size() - 1));
TotalOrderPartitioner.setPartitionFile(getConf(), outf);
for (String s : otherArgs) {
  FileInputFormat.addInputPath(job, new Path(s));
/**
 * Confirm the absence of the {@link TotalOrderPartitioner} partitions file.
 */
protected static void validateDeletedPartitionsFile(Configuration conf) throws IOException {
  if (!conf.getBoolean(IntegrationTestingUtility.IS_DISTRIBUTED_CLUSTER, false)) {
    return;
  }
  FileSystem fs = FileSystem.get(conf);
  Path partitionsFile = new Path(TotalOrderPartitioner.getPartitionFile(conf));
  assertFalse("Failed to clean up partitions file.", fs.exists(partitionsFile));
}
/**
 * Check whether a partition file exists for the hfile output; if so, use it to
 * replace the table splits and raise the job's reducer count to match.
 * @param conf the job configuration
 * @param path the hfile partition file
 * @throws IOException
 */
@SuppressWarnings("deprecation")
private void reconfigurePartitions(Configuration conf, Path path) throws IOException {
  FileSystem fs = path.getFileSystem(conf);
  if (fs.exists(path)) {
    try (SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, conf)) {
      int partitionCount = 0;
      Writable key = (Writable) ReflectionUtils.newInstance(reader.getKeyClass(), conf);
      Writable value = (Writable) ReflectionUtils.newInstance(reader.getValueClass(), conf);
      while (reader.next(key, value)) {
        partitionCount++;
      }
      TotalOrderPartitioner.setPartitionFile(job.getConfiguration(), path);
      // The number of reduce tasks should be one more than the partition keys
      job.setNumReduceTasks(partitionCount + 1);
    }
  } else {
    logger.info("File '" + path.toString() + "' doesn't exist, will not reconfigure hfile partitions");
  }
}
private static Job createSubmittableJob(Configuration conf, String tableName,
    Path inputDir, Path scratchDir, boolean localMode) throws IOException {
  HBaseHCatStorageHandler.setHBaseSerializers(conf);
  Job job = new Job(conf, NAME + "_" + tableName);
  job.setJarByClass(SequenceFileImporter.class);
  FileInputFormat.setInputPaths(job, inputDir);
  job.setInputFormatClass(SequenceFileInputFormat.class);
  job.setMapperClass(SequenceFileImporter.class);
  URI partitionURI;
  try {
    partitionURI = new URI(TotalOrderPartitioner.getPartitionFile(job.getConfiguration())
        + "#" + TotalOrderPartitioner.DEFAULT_PATH);
  } catch (URISyntaxException e) {
    ...
  }
  ...
  job.getConfiguration().set(TotalOrderPartitioner.PARTITIONER_PATH, partitionFile.toString());
private static <T extends WritableComparable<?>> Path writePartitionFile(
    String testname, Configuration conf, T[] splits) throws IOException {
  final FileSystem fs = FileSystem.getLocal(conf);
  final Path testdir =
      new Path(System.getProperty("test.build.data", "/tmp")).makeQualified(fs);
  Path p = new Path(testdir, testname + "/_partition.lst");
  TotalOrderPartitioner.setPartitionFile(conf, p);
  conf.setInt(MRJobConfig.NUM_REDUCES, splits.length + 1);
  SequenceFile.Writer w = null;
  try {
    w = SequenceFile.createWriter(fs, conf, p, splits[0].getClass(),
        NullWritable.class, SequenceFile.CompressionType.NONE);
    for (int i = 0; i < splits.length; ++i) {
      w.append(splits[i], NullWritable.get());
    }
  } finally {
    if (null != w) {
      w.close();
    }
  }
  return p;
}
private int createPartitionFile(String inputPath, String outputFile, float frequency,
    int samplesCnt) throws IOException, ClassNotFoundException, InterruptedException {
  Path input = new Path(inputPath);
  Job sampler = new Job(getConf());
  TextInputFormat.addInputPath(sampler, input);
  InputSampler.Sampler<LongWritable, Text> inputSampler =
      new InputSampler.RandomSampler<LongWritable, Text>(frequency, samplesCnt);
  Path partitionFile = new Path(outputFile);
  TotalOrderPartitioner.setPartitionFile(sampler.getConfiguration(), partitionFile);
  InputSampler.writePartitionFile(sampler, inputSampler);
  return 0;
}
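Several results above (the Sort driver, the HCatalog importer) append a "#" fragment when building the distributed-cache URI for the partition file. The fragment names the symlink created in each task's working directory; with no explicit partition file configured, TotalOrderPartitioner falls back to reading its DEFAULT_PATH ("_partition.lst") from the local filesystem. A minimal hedged sketch of that wiring, assuming the partition file already exists on HDFS:

import java.net.URI;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.partition.TotalOrderPartitioner;

public class CacheWiringSketch {
  /** Ships an existing partition file to tasks via the distributed cache. */
  static void shipPartitionFile(Job job, Path partitionFile) throws Exception {
    // Symlink the cached copy as "_partition.lst" in each task's working dir;
    // TotalOrderPartitioner reads that default name from the local filesystem
    // when no explicit partition file path is set in the configuration.
    URI cacheUri = new URI(partitionFile.toUri().toString()
        + "#" + TotalOrderPartitioner.DEFAULT_PATH);
    job.addCacheFile(cacheUri);
    job.setPartitionerClass(TotalOrderPartitioner.class);
  }
}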