/**
 * Deserializes the {@link WorkUnit} (or {@link MultiWorkUnit}) stored in the file whose path is
 * given by {@code value}, and accumulates the flattened work units into {@code this.workUnits}.
 */
@Override
public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
  String workUnitFilePath = value.toString();

  // A ".mwu" file (MULTI_WORK_UNIT_FILE_EXTENSION) holds a MultiWorkUnit; anything else a plain WorkUnit.
  WorkUnit workUnit;
  if (workUnitFilePath.endsWith(MULTI_WORK_UNIT_FILE_EXTENSION)) {
    workUnit = MultiWorkUnit.createEmpty();
  } else {
    workUnit = WorkUnit.createEmpty();
  }
  SerializationUtils.deserializeState(this.fs, new Path(workUnitFilePath), workUnit);

  if (workUnit instanceof MultiWorkUnit) {
    this.workUnits.addAll(JobLauncherUtils.flattenWorkUnits(((MultiWorkUnit) workUnit).getWorkUnits()));
  } else {
    this.workUnits.add(workUnit);
  }
}
}
/**
 * Pack the list of {@code WorkUnit}s into {@code MultiWorkUnit}s.
 *
 * TODO: this is currently a simple round-robin packing. More sophisticated bin packing may be necessary
 * if the round-robin approach leads to mapper skew.
 */
private static List<WorkUnit> pack(List<WorkUnit> workUnits, int numOfMultiWorkunits) {
  Preconditions.checkArgument(numOfMultiWorkunits > 0);

  // No packing needed when there are at least as many buckets as work units.
  if (workUnits.size() <= numOfMultiWorkunits) {
    return workUnits;
  }

  List<WorkUnit> buckets = Lists.newArrayListWithCapacity(numOfMultiWorkunits);
  for (int bucket = 0; bucket < numOfMultiWorkunits; bucket++) {
    buckets.add(MultiWorkUnit.createEmpty());
  }

  // Deal work units round-robin across the buckets.
  int index = 0;
  for (WorkUnit workUnit : workUnits) {
    ((MultiWorkUnit) buckets.get(index % numOfMultiWorkunits)).addWorkUnit(workUnit);
    index++;
  }
  return buckets;
}
/**
 * Adds {@code workUnit} to {@code multiWorkUnit} and increments the multi work unit's
 * estimated-size property by the work unit's estimated size.
 */
protected static void addWorkUnitToMultiWorkUnit(WorkUnit workUnit, MultiWorkUnit multiWorkUnit) {
  multiWorkUnit.addWorkUnit(workUnit);
  double currentEstSize = multiWorkUnit.getPropAsDouble(ESTIMATED_WORKUNIT_SIZE, 0.0);
  double updatedEstSize = currentEstSize + getWorkUnitEstSize(workUnit);
  multiWorkUnit.setProp(ESTIMATED_WORKUNIT_SIZE, updatedEstSize);
}
/**
 * Collects the Kafka partitions of every {@link WorkUnit} contained in the given
 * {@link MultiWorkUnit}, one inner list per work unit.
 */
protected static List<List<KafkaPartition>> getMultiWorkUnitPartitions(MultiWorkUnit mwu) {
  List<List<KafkaPartition>> partitionsPerWorkUnit = Lists.newArrayList();
  for (WorkUnit wu : mwu.getWorkUnits()) {
    partitionsPerWorkUnit.add(KafkaUtils.getPartitions(wu));
  }
  return partitionsPerWorkUnit;
}
// Deserialize a MultiWorkUnit from the work-unit file, then — even if reading fails partway —
// close the file and clean the task staging data of each contained WorkUnit.
// NOTE(review): this span is truncated; the closing braces of the for loop and finally block
// are outside the visible chunk, so the code is left unchanged here.
MultiWorkUnit mwu = MultiWorkUnit.createEmpty(); try { mwu.readFields(workUnitFileCloser.register(new DataInputStream(fs.open(status.getPath())))); } finally { workUnitFileCloser.close(); for (WorkUnit wu : mwu.getWorkUnits()) { JobLauncherUtils.cleanTaskStagingData(new WorkUnitState(wu), LOG);
/** * Group {@link WorkUnit}s into groups. Each group is a {@link MultiWorkUnit}. Each group has a capacity of * avgGroupSize. If there's a single {@link WorkUnit} whose size is larger than avgGroupSize, it forms a group itself. */ private static List<MultiWorkUnit> bestFitDecreasingBinPacking(List<WorkUnit> workUnits, double avgGroupSize) { // Sort workunits by data size desc Collections.sort(workUnits, LOAD_DESC_COMPARATOR); PriorityQueue<MultiWorkUnit> pQueue = new PriorityQueue<>(workUnits.size(), LOAD_DESC_COMPARATOR); for (WorkUnit workUnit : workUnits) { MultiWorkUnit bestGroup = findAndPopBestFitGroup(workUnit, pQueue, avgGroupSize); if (bestGroup != null) { addWorkUnitToMultiWorkUnit(workUnit, bestGroup); } else { bestGroup = MultiWorkUnit.createEmpty(); addWorkUnitToMultiWorkUnit(workUnit, bestGroup); } pQueue.add(bestGroup); } return Lists.newArrayList(pQueue); }
@BeforeClass public void setupWorkUnitFiles() throws IOException { this.conf = new Configuration(); this.fs = FileSystem.getLocal(this.conf); this.stagingDirs = Lists.newArrayList(); // Create a list of WorkUnits to serialize WorkUnit wu1 = createAndSetWorkUnit("wu1"); WorkUnit wu2 = createAndSetWorkUnit("wu2"); WorkUnit wu3 = createAndSetWorkUnit("wu3"); WorkUnit wu4 = createAndSetWorkUnit("wu4"); // Create a MultiWorkUnit to serialize MultiWorkUnit mwu1 = MultiWorkUnit.createEmpty(); mwu1.setProp(ConfigurationKeys.TASK_ID_KEY, System.nanoTime()); mwu1.addWorkUnits(Arrays.asList(wu3, wu4)); Path inputDir = new Path(new Path(OUTPUT_PATH, JOB_NAME), "input"); // Writer each WorkUnit to a separate file under inputDir Closer closer = Closer.create(); try { wu1.write(closer.register(this.fs .create(new Path(inputDir, wu1.getProp(ConfigurationKeys.TASK_ID_KEY) + Path.SEPARATOR + "_").suffix("wu")))); wu2.write(closer.register(this.fs .create(new Path(inputDir, wu2.getProp(ConfigurationKeys.TASK_ID_KEY) + Path.SEPARATOR + "_").suffix("wu")))); mwu1.write(closer.register(this.fs.create( new Path(inputDir, mwu1.getProp(ConfigurationKeys.TASK_ID_KEY) + Path.SEPARATOR + "_").suffix("mwu")))); } finally { closer.close(); } }
// Round-trips this.multiWorkUnit through Hadoop Writable serialization and asserts the copy
// contains both work units.
// NOTE(review): `dis` is not defined anywhere in this visible span — presumably a DataInputStream
// wrapping baos.toByteArray() created on a line outside this chunk; confirm against the full file.
// This fragment has no visible enclosing method, so the code is left unchanged.
ByteArrayOutputStream baos = closer.register(new ByteArrayOutputStream()); DataOutputStream dos = closer.register(new DataOutputStream(baos)); this.multiWorkUnit.write(dos); MultiWorkUnit copy = new MultiWorkUnit(); copy.readFields(dis); List<WorkUnit> workUnitList = copy.getWorkUnits(); Assert.assertEquals(workUnitList.size(), 2);
/**
 * Creates a named {@link WorkUnit} when given exactly one name; otherwise creates a
 * {@link MultiWorkUnit} containing one named {@link WorkUnit} per name.
 *
 * Note: zero names yields an empty {@link MultiWorkUnit} (unchanged behavior).
 */
private WorkUnit createWorkUnit(String... names) {
  if (names.length == 1) {
    // Use the createEmpty() factory for consistency with the rest of this codebase,
    // which uses WorkUnit.createEmpty()/MultiWorkUnit.createEmpty() over direct constructors.
    WorkUnit workUnit = WorkUnit.createEmpty();
    workUnit.setProp(WORK_UNIT_NAME, names[0]);
    return workUnit;
  }
  MultiWorkUnit mwu = MultiWorkUnit.createEmpty();
  for (String name : names) {
    // Recursive call hits the single-name branch above for each element.
    mwu.addWorkUnit(createWorkUnit(name));
  }
  return mwu;
}
/**
 * Verifies that {@code JobLauncherUtils.flattenWorkUnits} leaves plain work units as-is and
 * expands each {@link MultiWorkUnit} into its contained work units.
 */
@Test
public void testFlattenWorkUnits() {
  // A flat list of plain work units flattens to itself.
  List<WorkUnit> workUnitsOnly =
      Arrays.asList(WorkUnit.createEmpty(), WorkUnit.createEmpty(), WorkUnit.createEmpty());
  Assert.assertEquals(JobLauncherUtils.flattenWorkUnits(workUnitsOnly).size(), 3);

  MultiWorkUnit multiWorkUnit1 = MultiWorkUnit.createEmpty();
  multiWorkUnit1.addWorkUnits(Arrays.asList(WorkUnit.createEmpty(), WorkUnit.createEmpty(), WorkUnit.createEmpty()));

  MultiWorkUnit multiWorkUnit2 = MultiWorkUnit.createEmpty();
  // Bug fix: this previously added to multiWorkUnit1 again, leaving multiWorkUnit2 empty.
  // The total (3 + 6 + 0) still equaled 9, so the assertion below silently masked it.
  multiWorkUnit2.addWorkUnits(Arrays.asList(WorkUnit.createEmpty(), WorkUnit.createEmpty(), WorkUnit.createEmpty()));

  // 3 plain + 3 in multiWorkUnit1 + 3 in multiWorkUnit2 = 9 after flattening.
  List<WorkUnit> workUnitsAndMultiWorkUnits =
      Arrays.asList(WorkUnit.createEmpty(), WorkUnit.createEmpty(), WorkUnit.createEmpty(), multiWorkUnit1,
          multiWorkUnit2);
  Assert.assertEquals(JobLauncherUtils.flattenWorkUnits(workUnitsAndMultiWorkUnits).size(), 9);
}
/**
 * Adds {@code workUnit} to {@code multiWorkUnit} and increases the multi work unit's
 * total weight by {@code weight}.
 */
private static void addToMultiWorkUnit(MultiWorkUnit multiWorkUnit, WorkUnit workUnit, long weight) {
  multiWorkUnit.addWorkUnit(workUnit);
  long updatedWeight = getMultiWorkUnitWeight(multiWorkUnit) + weight;
  setMultiWorkUnitWeight(multiWorkUnit, updatedWeight);
}
/**
 * Stores {@code weight} as the total-weight property of the given {@link MultiWorkUnit}.
 */
private static void setMultiWorkUnitWeight(MultiWorkUnit multiWorkUnit, long weight) {
  multiWorkUnit.setProp(TOTAL_MULTI_WORK_UNIT_WEIGHT, String.valueOf(weight));
}
/**
 * Static factory returning a fresh, empty {@link MultiWorkUnit}.
 *
 * @return a new empty {@link MultiWorkUnit} instance
 */
public static MultiWorkUnit createEmpty() {
  return new MultiWorkUnit();
}
}
/**
 * Extracts the Kafka partition assigned to each {@link WorkUnit} contained in the given
 * {@link MultiWorkUnit}, preserving work-unit order.
 */
private static List<KafkaPartition> getPartitionsFromMultiWorkUnit(MultiWorkUnit multiWorkUnit) {
  List<KafkaPartition> result = Lists.newArrayList();
  for (WorkUnit wu : multiWorkUnit.getWorkUnits()) {
    result.add(KafkaUtils.getPartition(wu));
  }
  return result;
}
// Deserialize a MultiWorkUnit from the work-unit file, then — even if reading fails partway —
// close the file and clean the task staging data of each contained WorkUnit.
// NOTE(review): this span is truncated; the closing braces of the for loop and finally block
// are outside the visible chunk, so the code is left unchanged here.
MultiWorkUnit mwu = MultiWorkUnit.createEmpty(); try { mwu.readFields(workUnitFileCloser.register(new DataInputStream(fs.open(status.getPath())))); } finally { workUnitFileCloser.close(); for (WorkUnit wu : mwu.getWorkUnits()) { JobLauncherUtils.cleanTaskStagingData(new WorkUnitState(wu), LOG);
@Override public List<WorkUnit> pack(Map<String, List<WorkUnit>> workUnitsByTopic, int numContainers) { double totalEstDataSize = setWorkUnitEstSizes(workUnitsByTopic); double avgGroupSize = totalEstDataSize / numContainers / getPreGroupingSizeFactor(this.state); List<MultiWorkUnit> mwuGroups = Lists.newArrayList(); for (List<WorkUnit> workUnitsForTopic : workUnitsByTopic.values()) { double estimatedDataSizeForTopic = calcTotalEstSizeForTopic(workUnitsForTopic); if (estimatedDataSizeForTopic < avgGroupSize) { // If the total estimated size of a topic is smaller than group size, put all partitions of this // topic in a single group. MultiWorkUnit mwuGroup = MultiWorkUnit.createEmpty(); addWorkUnitsToMultiWorkUnit(workUnitsForTopic, mwuGroup); mwuGroups.add(mwuGroup); } else { // Use best-fit-decreasing to group workunits for a topic into multiple groups. mwuGroups.addAll(bestFitDecreasingBinPacking(workUnitsForTopic, avgGroupSize)); } } List<WorkUnit> groups = squeezeMultiWorkUnits(mwuGroups); return worstFitDecreasingBinPacking(groups, numContainers); }
/**
 * Builds a {@link MultiWorkUnit} containing two work units with adjacent watermark ranges
 * and one distinct property each, for use by the tests in this class.
 */
@BeforeClass
public void setUp() {
  this.multiWorkUnit = new MultiWorkUnit();

  // First work unit covers watermarks [0, 1000].
  WorkUnit first = WorkUnit.createEmpty();
  first.setHighWaterMark(1000);
  first.setLowWaterMark(0);
  first.setProp("k1", "v1");
  this.multiWorkUnit.addWorkUnit(first);

  // Second work unit covers watermarks [1001, 2000].
  WorkUnit second = WorkUnit.createEmpty();
  second.setHighWaterMark(2000);
  second.setLowWaterMark(1001);
  second.setProp("k2", "v2");
  this.multiWorkUnit.addWorkUnit(second);
}
/**
 * Builds one {@link WorkUnit} per source file, alternating between two snapshot extracts;
 * when the "use.multiworkunit" property is true, wraps all of them in a single
 * {@link MultiWorkUnit} and returns that instead.
 */
@Override
public List<WorkUnit> getWorkunits(SourceState state) {
  String nameSpace = state.getProp(ConfigurationKeys.EXTRACT_NAMESPACE_NAME_KEY);
  Extract extract1 = createExtract(TableType.SNAPSHOT_ONLY, nameSpace, "TestTable1");
  Extract extract2 = createExtract(TableType.SNAPSHOT_ONLY, nameSpace, "TestTable2");

  List<String> sourceFiles = SPLITTER.splitToList(state.getProp(SOURCE_FILE_LIST_KEY));

  List<WorkUnit> workUnits = Lists.newArrayList();
  int index = 0;
  for (String sourceFile : sourceFiles) {
    // Even-indexed files go to extract1, odd-indexed files to extract2.
    WorkUnit workUnit = WorkUnit.create(index % 2 == 0 ? extract1 : extract2);
    workUnit.setProp(SOURCE_FILE_KEY, sourceFile);
    workUnits.add(workUnit);
    index++;
  }

  if (state.getPropAsBoolean("use.multiworkunit", false)) {
    MultiWorkUnit multiWorkUnit = MultiWorkUnit.createEmpty();
    multiWorkUnit.addWorkUnits(workUnits);
    workUnits.clear();
    workUnits.add(multiWorkUnit);
  }

  return workUnits;
}
/**
 * Appends {@code workUnit} to {@code multiWorkUnit}, adding {@code weight} to the multi
 * work unit's running total weight.
 */
private static void addToMultiWorkUnit(MultiWorkUnit multiWorkUnit, WorkUnit workUnit, long weight) {
  multiWorkUnit.addWorkUnit(workUnit);
  setMultiWorkUnitWeight(multiWorkUnit, weight + getMultiWorkUnitWeight(multiWorkUnit));
}
/**
 * Overwrites the total-weight property of {@code multiWorkUnit} with the given {@code weight},
 * stored as its decimal string representation.
 */
private static void setMultiWorkUnitWeight(MultiWorkUnit multiWorkUnit, long weight) {
  String weightValue = Long.toString(weight);
  multiWorkUnit.setProp(TOTAL_MULTI_WORK_UNIT_WEIGHT, weightValue);
}