/** * Confirm the absence of the {@link TotalOrderPartitioner} partitions file. */ protected static void validateDeletedPartitionsFile(Configuration conf) throws IOException { if (!conf.getBoolean(IntegrationTestingUtility.IS_DISTRIBUTED_CLUSTER, false)) return; FileSystem fs = FileSystem.get(conf); Path partitionsFile = new Path(TotalOrderPartitioner.getPartitionFile(conf)); assertFalse("Failed to clean up partitions file.", fs.exists(partitionsFile)); }
/** * Configure <code>job</code> with a TotalOrderPartitioner, partitioning against * <code>splitPoints</code>. Cleans up the partitions file after job exists. */ static void configurePartitioner(Job job, List<ImmutableBytesWritable> splitPoints) throws IOException { Configuration conf = job.getConfiguration(); // create the partitions file FileSystem fs = FileSystem.get(conf); Path partitionsPath = new Path(conf.get("hbase.fs.tmp.dir"), "partitions_" + RandomUtil.randomUUID()); fs.makeQualified(partitionsPath); writePartitions(conf, partitionsPath, splitPoints); fs.deleteOnExit(partitionsPath); // configure job to use it job.setPartitionerClass(TotalOrderPartitioner.class); TotalOrderPartitioner.setPartitionFile(conf, partitionsPath); }
ReflectionUtils.newInstance(job.getInputFormatClass(), conf); int numPartitions = job.getNumReduceTasks(); K[] samples = (K[])sampler.getSample(inf, job); LOG.info("Using " + samples.length + " samples"); RawComparator<K> comparator = (RawComparator<K>) job.getGroupingComparator(); Arrays.sort(samples, comparator); Path dst = new Path(TotalOrderPartitioner.getPartitionFile(conf)); FileSystem fs = dst.getFileSystem(conf); if (fs.exists(dst)) fs.delete(dst, false);
/** * Given a sorted set of cut points, build a trie that will find the correct * partition quickly. * @param splits the list of cut points * @param lower the lower bound of partitions 0..numPartitions-1 * @param upper the upper bound of partitions 0..numPartitions-1 * @param prefix the prefix that we have already checked against * @param maxDepth the maximum depth we will build a trie for * @return the trie node that will divide the splits correctly */ private TrieNode buildTrie(BinaryComparable[] splits, int lower, int upper, byte[] prefix, int maxDepth) { return buildTrieRec (splits, lower, upper, prefix, maxDepth, new CarriedTrieNodeRef()); }
public void configure(JobConf job) { super.setConf(job); } }
public void configure(JobConf job) { super.setConf(job); } }
@SuppressWarnings("unchecked") // is memcmp-able and uses the trie public int getPartition(K key, V value, int numPartitions) { return partitions.findPartition(key); }
/** test partitioner for patterns */ @Test public void testPatterns() { int results[] = new int[PARTITIONS]; RehashPartitioner <IntWritable, NullWritable> p = new RehashPartitioner < IntWritable, NullWritable> (); /* test sequence 4, 8, 12, ... 128 */ for(int i = 0; i < END; i+= STEP) { results[p.getPartition(new IntWritable(i), null, PARTITIONS)]++; } int badbuckets = 0; Integer min = Collections.min(Arrays.asList(ArrayUtils.toObject(results))); Integer max = Collections.max(Arrays.asList(ArrayUtils.toObject(results))); Integer avg = (int) Math.round((max+min)/2.0); System.out.println("Dumping buckets distribution: min="+min+" avg="+avg+" max="+max); for (int i = 0; i < PARTITIONS; i++) { double var = (results[i]-avg)/(double)(avg); System.out.println("bucket "+i+" "+results[i]+" items, variance "+var); if (Math.abs(var) > MAX_ERROR) badbuckets++; } System.out.println(badbuckets + " of "+PARTITIONS+" are too small or large buckets"); assertTrue("too many overflow buckets", badbuckets < PARTITIONS * MAX_BADBUCKETS); } }
/** * Configure <code>job</code> with a TotalOrderPartitioner, partitioning against * <code>splitPoints</code>. Cleans up the partitions file after job exists. */ static void configurePartitioner(Job job, List<ImmutableBytesWritable> splitPoints, boolean writeMultipleTables) throws IOException { Configuration conf = job.getConfiguration(); // create the partitions file FileSystem fs = FileSystem.get(conf); String hbaseTmpFsDir = conf.get(HConstants.TEMPORARY_FS_DIRECTORY_KEY, fs.getHomeDirectory() + "/hbase-staging"); Path partitionsPath = new Path(hbaseTmpFsDir, "partitions_" + UUID.randomUUID()); fs.makeQualified(partitionsPath); writePartitions(conf, partitionsPath, splitPoints, writeMultipleTables); fs.deleteOnExit(partitionsPath); // configure job to use it job.setPartitionerClass(TotalOrderPartitioner.class); TotalOrderPartitioner.setPartitionFile(conf, partitionsPath); }
/** * Given a sorted set of cut points, build a trie that will find the correct * partition quickly. * @param splits the list of cut points * @param lower the lower bound of partitions 0..numPartitions-1 * @param upper the upper bound of partitions 0..numPartitions-1 * @param prefix the prefix that we have already checked against * @param maxDepth the maximum depth we will build a trie for * @return the trie node that will divide the splits correctly */ private TrieNode buildTrie(BinaryComparable[] splits, int lower, int upper, byte[] prefix, int maxDepth) { return buildTrieRec (splits, lower, upper, prefix, maxDepth, new CarriedTrieNodeRef()); }
RawComparator.class); Path partitionsPath = new Path(TotalOrderPartitioner.getPartitionFile(job.getConfiguration())); FileSystem fs = FileSystem.get(job.getConfiguration()); fs.deleteOnExit(partitionsPath);
public void configure(JobConf job) { super.setConf(job); } }
/** * Check if there's partition files for hfile, if yes replace the table splits, to make the job more reducers * @param conf the job configuration * @param path the hfile partition file * @throws IOException */ @SuppressWarnings("deprecation") private void reconfigurePartitions(Configuration conf, Path path) throws IOException { FileSystem fs = path.getFileSystem(conf); if (fs.exists(path)) { try (SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, conf)) { int partitionCount = 0; Writable key = (Writable) ReflectionUtils.newInstance(reader.getKeyClass(), conf); Writable value = (Writable) ReflectionUtils.newInstance(reader.getValueClass(), conf); while (reader.next(key, value)) { partitionCount++; } TotalOrderPartitioner.setPartitionFile(job.getConfiguration(), path); // The reduce tasks should be one more than partition keys job.setNumReduceTasks(partitionCount + 1); } } else { logger.info("File '" + path.toString() + " doesn't exist, will not reconfigure hfile Partitions"); } }
/** * Given a sorted set of cut points, build a trie that will find the correct * partition quickly. * @param splits the list of cut points * @param lower the lower bound of partitions 0..numPartitions-1 * @param upper the upper bound of partitions 0..numPartitions-1 * @param prefix the prefix that we have already checked against * @param maxDepth the maximum depth we will build a trie for * @return the trie node that will divide the splits correctly */ private TrieNode buildTrie(BinaryComparable[] splits, int lower, int upper, byte[] prefix, int maxDepth) { return buildTrieRec (splits, lower, upper, prefix, maxDepth, new CarriedTrieNodeRef()); }
public void configure(JobConf job) { super.setConf(job); } }
public Job createSubmittableJob(String[] args) throws IOException { Path partitionsPath = new Path(destPath, PARTITIONS_FILE_NAME); generatePartitions(partitionsPath); Job job = Job.getInstance(getConf(), getConf().get("mapreduce.job.name", "hashTable_" + tableHash.tableName)); Configuration jobConf = job.getConfiguration(); jobConf.setLong(HASH_BATCH_SIZE_CONF_KEY, tableHash.batchSize); job.setJarByClass(HashTable.class); TableMapReduceUtil.initTableMapperJob(tableHash.tableName, tableHash.initScan(), HashMapper.class, ImmutableBytesWritable.class, ImmutableBytesWritable.class, job); // use a TotalOrderPartitioner and reducers to group region output into hash files job.setPartitionerClass(TotalOrderPartitioner.class); TotalOrderPartitioner.setPartitionFile(jobConf, partitionsPath); job.setReducerClass(Reducer.class); // identity reducer job.setNumReduceTasks(tableHash.numHashFiles); job.setOutputKeyClass(ImmutableBytesWritable.class); job.setOutputValueClass(ImmutableBytesWritable.class); job.setOutputFormatClass(MapFileOutputFormat.class); FileOutputFormat.setOutputPath(job, new Path(destPath, HASH_DATA_DIR)); return job; }
/** * Given a sorted set of cut points, build a trie that will find the correct * partition quickly. * @param splits the list of cut points * @param lower the lower bound of partitions 0..numPartitions-1 * @param upper the upper bound of partitions 0..numPartitions-1 * @param prefix the prefix that we have already checked against * @param maxDepth the maximum depth we will build a trie for * @return the trie node that will divide the splits correctly */ private TrieNode buildTrie(BinaryComparable[] splits, int lower, int upper, byte[] prefix, int maxDepth) { return buildTrieRec (splits, lower, upper, prefix, maxDepth, new CarriedTrieNodeRef()); }
public void configure(JobConf job) { super.setConf(job); } }
/** * Configure <code>job</code> with a TotalOrderPartitioner, partitioning against * <code>splitPoints</code>. Cleans up the partitions file after job exists. */ static void configurePartitioner(Job job, Set<TableRowkeyPair> tablesStartKeys) throws IOException { Configuration conf = job.getConfiguration(); // create the partitions file Path partitionsPath = new Path(conf.get("hadoop.tmp.dir"), "partitions_" + UUID.randomUUID()); FileSystem fs = partitionsPath.getFileSystem(conf); fs.makeQualified(partitionsPath); writePartitions(conf, partitionsPath, tablesStartKeys); fs.deleteOnExit(partitionsPath); // configure job to use it job.setPartitionerClass(TotalOrderPartitioner.class); TotalOrderPartitioner.setPartitionFile(conf, partitionsPath); }
TotalOrderPartitioner.setPartitionFile(job.getConfiguration(), partitionsPath);