@Override
public byte[] apply(byte[] input) {
  return rowKeyDistributor.getOriginalKey(input);
}
};
@Override
public byte[] getActualRowKey(ConsumerConfig consumerConfig, byte[] originalRowKey) {
  return rowKeyDistributor.getDistributedKey(originalRowKey);
}
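// A minimal round-trip sketch, not part of the snippets above (the distributor
// construction is an assumption, using HBaseWD's RowKeyDistributorByHashPrefix):
// whatever getDistributedKey(...) prepends, getOriginalKey(...) must strip, so
// the two callbacks above are exact inverses of each other.
AbstractRowKeyDistributor distributor =
    new RowKeyDistributorByHashPrefix(new RowKeyDistributorByHashPrefix.OneByteSimpleHash(32));
byte[] originalKey = Bytes.toBytes("user#42");
byte[] distributedKey = distributor.getDistributedKey(originalKey);  // bucket prefix + originalKey
assert Arrays.equals(originalKey, distributor.getOriginalKey(distributedKey));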
byte[][] bucketSplits = keyDistributor.getAllDistributedKeys(Bytes.EMPTY_BYTE_ARRAY);
Preconditions.checkArgument(splits >= 1 && splits <= MAX_SPLIT_COUNT_PER_BUCKET * bucketSplits.length,
                            "Number of pre-splits should be in [1.." +
                            MAX_SPLIT_COUNT_PER_BUCKET * bucketSplits.length + "] range");
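// Illustration of the bound being checked above (assumption: HBaseWD's
// RowKeyDistributorByOneBytePrefix over 4 buckets). getAllDistributedKeys(...)
// on the empty key yields one start key per bucket, so the argument check caps
// the total at MAX_SPLIT_COUNT_PER_BUCKET pre-splits per bucket.
AbstractRowKeyDistributor d = new RowKeyDistributorByOneBytePrefix((byte) 4);
byte[][] starts = d.getAllDistributedKeys(Bytes.EMPTY_BYTE_ARRAY);
// starts.length == 4, with per-bucket start keys {0x00}, {0x01}, {0x02}, {0x03}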
/** Testing simple get. */
@Test
public void testGet() throws IOException, InterruptedException {
  byte[] key = new byte[] {123, 124, 122};
  byte[] distributedKey = keyDistributor.getDistributedKey(key);
  byte[] value = Bytes.toBytes("some");

  hTable.put(new Put(distributedKey).add(CF, QUAL, value));

  Result result = hTable.get(new Get(distributedKey));
  Assert.assertArrayEquals(key, keyDistributor.getOriginalKey(result.getRow()));
  Assert.assertArrayEquals(value, result.getValue(CF, QUAL));
}
@Override
public List<InputSplit> getSplits(JobContext context) throws IOException {
  List<InputSplit> allSplits = new ArrayList<>();
  Scan originalScan = getScan();

  Scan[] scans = rowKeyDistributor.getDistributedScans(originalScan);
  for (Scan scan : scans) {
    // Internally super.getSplits(...) uses the scan object stored in a private field;
    // to reuse the superclass code we temporarily swap in each distributed scan.
    setScan(scan);
    List<InputSplit> splits = super.getSplits(context);
    allSplits.addAll(splits);
  }

  // Restore the original scan.
  setScan(originalScan);
  return allSplits;
}
}
public final Scan[] getDistributedScans(Scan original) throws IOException {
  Pair<byte[], byte[]>[] intervals = getDistributedIntervals(original.getStartRow(), original.getStopRow());

  Scan[] scans = new Scan[intervals.length];
  for (int i = 0; i < intervals.length; i++) {
    scans[i] = new Scan(original);
    scans[i].setStartRow(intervals[i].getFirst());
    scans[i].setStopRow(intervals[i].getSecond());
  }
  return scans;
}
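// Hypothetical caller for getDistributedScans(...) (table, key range, and the
// process(...) hook are assumptions): one logical range scan fans out into one
// scan per bucket, and each returned row key is mapped back to its original
// form before processing.
Scan logicalScan = new Scan(Bytes.toBytes("a"), Bytes.toBytes("z"));
for (Scan bucketScan : keyDistributor.getDistributedScans(logicalScan)) {
  ResultScanner scanner = hTable.getScanner(bucketScan);
  try {
    for (Result r : scanner) {
      byte[] originalKey = keyDistributor.getOriginalKey(r.getRow());
      // process(originalKey, r) ...
    }
  } finally {
    scanner.close();
  }
}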
private ScanBuilder configureRangeScan(ScanBuilder scan, @Nullable byte[] startRow, @Nullable byte[] stopRow,
                                       @Nullable FuzzyRowFilter filter) {
  // TODO: should be configurable
  scan.setCaching(1000);

  if (startRow != null) {
    scan.setStartRow(startRow);
  }
  if (stopRow != null) {
    scan.setStopRow(stopRow);
  }
  scan.addFamily(columnFamily);

  if (filter != null) {
    List<Pair<byte[], byte[]>> fuzzyPairs = Lists.newArrayListWithExpectedSize(filter.getFuzzyKeysData().size());
    for (ImmutablePair<byte[], byte[]> pair : filter.getFuzzyKeysData()) {
      if (rowKeyDistributor != null) {
        fuzzyPairs.addAll(rowKeyDistributor.getDistributedFilterPairs(pair));
      } else {
        // Make a copy of the filter pair because the key and mask get modified in HBase's FuzzyRowFilter.
        fuzzyPairs.add(Pair.newPair(Arrays.copyOf(pair.getFirst(), pair.getFirst().length),
                                    Arrays.copyOf(pair.getSecond(), pair.getSecond().length)));
      }
    }
    scan.setFilter(new org.apache.hadoop.hbase.filter.FuzzyRowFilter(fuzzyPairs));
  }
  return scan;
}
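// Background sketch for the mask convention used above, calling HBase's own
// org.apache.hadoop.hbase.filter.FuzzyRowFilter directly (the byte values are
// illustrative, not from the original): in each (key, mask) pair a mask byte of
// 0 means "must equal the key byte" and 1 means "any byte" — and HBase mutates
// the arrays in place, which is why the method above copies each pair.
Pair<byte[], byte[]> fuzzyPair = Pair.newPair(
    new byte[] {10, 20, 0, 0},  // key: fixed prefix, wildcard positions zeroed
    new byte[] {0, 0, 1, 1});   // mask: 0 = fixed, 1 = any
Filter fuzzy = new org.apache.hadoop.hbase.filter.FuzzyRowFilter(Lists.newArrayList(fuzzyPair));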
private void testMapReduceInternal(long origKeyPrefix, Scan scan, int numValues, int startWithValue,
                                   int seekIntervalMinValue, int seekIntervalMaxValue)
    throws IOException, InterruptedException, ClassNotFoundException {
  int valuesCountInSeekInterval =
      writeTestData(origKeyPrefix, numValues, startWithValue, seekIntervalMinValue, seekIntervalMaxValue);

  // Reading data
  Configuration conf = new Configuration(testingUtility.getConfiguration());
  conf.set("fs.defaultFS", "file:///");
  conf.set("fs.default.name", "file:///");
  conf.setInt("mapreduce.local.map.tasks.maximum", 16);
  conf.setInt("mapreduce.local.reduce.tasks.maximum", 16);

  Job job = Job.getInstance(conf, "testMapReduceInternal()-Job");
  TableMapReduceUtil.initTableMapperJob(TABLE_NAME, scan, RowCounterMapper.class,
                                        ImmutableBytesWritable.class, Result.class, job);

  // Substitute the standard TableInputFormat set by TableMapReduceUtil.initTableMapperJob(...).
  job.setInputFormatClass(WdTableInputFormat.class);
  keyDistributor.addInfo(job.getConfiguration());

  job.setOutputFormatClass(NullOutputFormat.class);
  job.setNumReduceTasks(0);

  boolean succeeded = job.waitForCompletion(true);
  Assert.assertTrue(succeeded);

  long mapInputRecords = job.getCounters().findCounter(RowCounterMapper.Counters.ROWS).getValue();
  Assert.assertEquals(valuesCountInSeekInterval, mapInputRecords);

  // Kill the job after completion: otherwise it can leave the MRAppMaster running.
  // The cause is unclear; it may be a MiniYarnCluster issue.
  job.killJob();
}
@Override
public byte[] getRow() {
  return rowKeyDistributor.getOriginalKey(result.getRow());
}
};
private byte[] createDistributedRowKey(byte[] row) {
  return rowKeyDistributor == null ? row : rowKeyDistributor.getDistributedKey(row);
}
public static DistributedScanner create(HTableInterface hTable, Scan originalScan,
                                        AbstractRowKeyDistributor keyDistributor,
                                        ExecutorService scansExecutor) throws IOException {
  Scan[] scans = keyDistributor.getDistributedScans(originalScan);

  ResultScanner[] rss = new ResultScanner[scans.length];
  for (int i = 0; i < scans.length; i++) {
    rss[i] = hTable.getScanner(scans[i]);
  }

  int caching = originalScan.getCaching();
  // The distributed scan needs the caching value to work efficiently, so when
  // it is not set on the scan we resolve it from the configuration instead.
  if (caching < 1) {
    caching = hTable.getConfiguration().getInt("hbase.client.scanner.caching", 1);
  }
  return new DistributedScanner(keyDistributor, rss, caching, scansExecutor);
}
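// Hypothetical usage of the factory above (executor sizing is an assumption,
// as is iterating DistributedScanner like the ResultScanner instances it
// wraps): results are merged across all per-bucket scanners.
ExecutorService scansExecutor = Executors.newFixedThreadPool(8);
try {
  DistributedScanner scanner = DistributedScanner.create(hTable, new Scan(), keyDistributor, scansExecutor);
  try {
    for (Result r : scanner) {
      byte[] originalKey = keyDistributor.getOriginalKey(r.getRow());
      // process(originalKey, r) ...
    }
  } finally {
    scanner.close();
  }
} finally {
  scansExecutor.shutdown();
}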
byte[][] bucketSplits = getAllDistributedKeys(co.cask.cdap.api.common.Bytes.EMPTY_BYTE_ARRAY);
Preconditions.checkArgument(splits >= 1 && splits <= 0xff * bucketSplits.length,
                            "Number of pre-splits should be in [1.." +
                            0xff * bucketSplits.length + "] range");
@Override
public byte[] getRow() {
  return keyDistributor.getOriginalKey(result.getRow());
}