@Override
public void write(scala.collection.Iterator<Product2<K, V>> records) throws IOException {
  // Keep track of success so we know if we encountered an exception.
  // We do this rather than a standard try/catch/re-throw to handle
  // generic throwables.
  boolean success = false;
  try {
    while (records.hasNext()) {
      insertRecordIntoSorter(records.next());
    }
    closeAndWriteOutput();
    success = true;
  } finally {
    if (sorter != null) {
      try {
        sorter.cleanupResources();
      } catch (Exception e) {
        // Only throw this error if we won't be masking another error.
        if (success) {
          throw e;
        } else {
          logger.error("In addition to a failure during writing, we failed during cleanup.", e);
        }
      }
    }
  }
}
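The success flag exists so that a failure in sorter.cleanupResources() never masks the original write failure. A minimal, self-contained sketch of the same masking-avoidance pattern outside Spark (the Resource interface and all names here are hypothetical, for illustration only):

import java.io.IOException;

final class CleanupWithoutMasking {
  // Hypothetical resource, used only to demonstrate the pattern.
  interface Resource {
    void doWork() throws IOException;
    void cleanup() throws IOException;
  }

  static void run(Resource resource) throws IOException {
    boolean success = false;
    try {
      resource.doWork();
      success = true;
    } finally {
      try {
        resource.cleanup();
      } catch (IOException e) {
        if (success) {
          // Nothing else failed, so the cleanup failure is the real error.
          throw e;
        } else {
          // doWork() already threw; log rather than replace that exception.
          System.err.println("Cleanup also failed: " + e);
        }
      }
    }
  }
}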
@Test
public void writeRecordsThatAreBiggerThanDiskWriteBufferSize() throws Exception {
  final UnsafeShuffleWriter<Object, Object> writer = createWriter(false);
  final ArrayList<Product2<Object, Object>> dataToWrite = new ArrayList<>();
  final byte[] bytes = new byte[(int) (ShuffleExternalSorter.DISK_WRITE_BUFFER_SIZE * 2.5)];
  new Random(42).nextBytes(bytes);
  dataToWrite.add(new Tuple2<>(1, ByteBuffer.wrap(bytes)));
  writer.write(dataToWrite.iterator());
  writer.stop(true);
  assertEquals(
    HashMultiset.create(dataToWrite),
    HashMultiset.create(readRecordsFromFile()));
  assertSpillFilesWereCleanedUp();
}
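The payload above is 2.5x the sorter's fixed disk-write buffer, which forces the record to be copied out in buffer-sized chunks. A hedged sketch of that chunked-copy idea (the constant and names are illustrative, not ShuffleExternalSorter's actual internals):

import java.io.ByteArrayOutputStream;

final class ChunkedCopyDemo {
  // Illustrative fixed buffer size, playing the role of DISK_WRITE_BUFFER_SIZE.
  static final int BUFFER_SIZE = 1024 * 1024;

  // Copy an arbitrarily large record through a fixed-size buffer.
  static void writeRecord(byte[] record, ByteArrayOutputStream out) {
    final byte[] buffer = new byte[BUFFER_SIZE];
    int written = 0;
    while (written < record.length) {
      final int toTransfer = Math.min(BUFFER_SIZE, record.length - written);
      System.arraycopy(record, written, buffer, 0, toTransfer);
      out.write(buffer, 0, toTransfer);
      written += toTransfer;
    }
  }
}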
@Test
public void spillFilesAreDeletedWhenStoppingAfterError() throws IOException {
  final UnsafeShuffleWriter<Object, Object> writer = createWriter(false);
  writer.insertRecordIntoSorter(new Tuple2<>(1, 1));
  writer.insertRecordIntoSorter(new Tuple2<>(2, 2));
  writer.forceSorterToSpill();
  writer.insertRecordIntoSorter(new Tuple2<>(2, 2));
  writer.stop(false);
  assertSpillFilesWereCleanedUp();
}
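assertSpillFilesWereCleanedUp is a suite helper. One plausible implementation, assuming the test fixture records every spill file it hands out in a spillFilesCreated list (that field name is an assumption):

private void assertSpillFilesWereCleanedUp() {
  for (File spillFile : spillFilesCreated) {
    assertFalse("Spill file " + spillFile.getPath() + " was not cleaned up",
      spillFile.exists());
  }
}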
@VisibleForTesting
void closeAndWriteOutput() throws IOException {
  assert(sorter != null);
  updatePeakMemoryUsed();
  serBuffer = null;
  serOutputStream = null;
  final SpillInfo[] spills = sorter.closeAndGetSpills();
  sorter = null;
  final long[] partitionLengths;
  final File output = shuffleBlockResolver.getDataFile(shuffleId, mapId);
  final File tmp = Utils.tempFileWith(output);
  try {
    try {
      partitionLengths = mergeSpills(spills, tmp);
    } finally {
      for (SpillInfo spill : spills) {
        if (spill.file.exists() && !spill.file.delete()) {
          logger.error("Error while deleting spill file {}", spill.file.getPath());
        }
      }
    }
    shuffleBlockResolver.writeIndexFileAndCommit(shuffleId, mapId, partitionLengths, tmp);
  } finally {
    if (tmp.exists() && !tmp.delete()) {
      logger.error("Error while deleting temp file {}", tmp.getAbsolutePath());
    }
  }
  mapStatus = MapStatus$.MODULE$.apply(blockManager.shuffleServerId(), partitionLengths);
}
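Merging into a temp file created next to the final data file, then handing it to writeIndexFileAndCommit, is a write-temp-then-commit pattern: readers never observe a partially merged data file. A hedged, standalone sketch of the pattern (these helpers are hypothetical analogues, not Spark's Utils or resolver APIs):

import java.io.File;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.StandardCopyOption;
import java.util.UUID;

final class TempThenCommit {
  // Create the temp file in the same directory as the target so the final
  // rename stays on one filesystem (hypothetical analogue of Utils.tempFileWith).
  static File tempFileWith(File target) {
    return new File(target.getParentFile(), target.getName() + "." + UUID.randomUUID());
  }

  static void commit(File tmp, File target) throws IOException {
    // Atomic within one filesystem: readers see either the old file or the new one.
    Files.move(tmp.toPath(), target.toPath(), StandardCopyOption.ATOMIC_MOVE);
  }
}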
when(taskMemoryManager.pageSizeBytes()).thenReturn(pageSizeBytes);
final UnsafeShuffleWriter<Object, Object> writer = new UnsafeShuffleWriter<>(
  blockManager,
  shuffleBlockResolver,
  taskMemoryManager,
  new SerializedShuffleHandle<>(0, 1, shuffleDep),
  0, // map id
  taskContext,
  conf);

long previousPeakMemory = writer.getPeakMemoryUsedBytes();
long newPeakMemory;
try {
  for (int i = 0; i < numRecordsPerPage * 10; i++) {
    writer.insertRecordIntoSorter(new Tuple2<Object, Object>(1, 1));
    newPeakMemory = writer.getPeakMemoryUsedBytes();
    if (i % numRecordsPerPage == 0) {
      // A new page is allocated every numRecordsPerPage records, so peak memory
      // grows by exactly one page; otherwise it must stay flat.
      assertEquals(previousPeakMemory + pageSizeBytes, newPeakMemory);
    } else {
      assertEquals(previousPeakMemory, newPeakMemory);
    }
    previousPeakMemory = newPeakMemory;
  }

  // Spilling should not change peak memory.
  writer.forceSorterToSpill();
  newPeakMemory = writer.getPeakMemoryUsedBytes();
  assertEquals(previousPeakMemory, newPeakMemory);

  // Nor should re-inserting up to a page's worth of records after the spill.
  for (int i = 0; i < numRecordsPerPage; i++) {
    writer.insertRecordIntoSorter(new Tuple2<Object, Object>(1, 1));
    newPeakMemory = writer.getPeakMemoryUsedBytes();
    assertEquals(previousPeakMemory, newPeakMemory);
  }

  // Closing the writer should not change peak memory either.
  writer.closeAndWriteOutput();
  newPeakMemory = writer.getPeakMemoryUsedBytes();
  assertEquals(previousPeakMemory, newPeakMemory);
} finally {
  writer.stop(false);
}
for (int i : new int[] { 1, 2, 3, 4, 4, 2 }) {
  dataToWrite.add(new Tuple2<>(i, i));
}
writer.insertRecordIntoSorter(dataToWrite.get(0));
writer.insertRecordIntoSorter(dataToWrite.get(1));
writer.insertRecordIntoSorter(dataToWrite.get(2));
writer.insertRecordIntoSorter(dataToWrite.get(3));
writer.forceSorterToSpill();
writer.insertRecordIntoSorter(dataToWrite.get(4));
writer.insertRecordIntoSorter(dataToWrite.get(5));
writer.closeAndWriteOutput();
final Option<MapStatus> mapStatus = writer.stop(true);
assertTrue(mapStatus.isDefined());
assertTrue(mergedOutputFile.exists());
/**
 * This convenience method should only be called in test code.
 */
@VisibleForTesting
public void write(Iterator<Product2<K, V>> records) throws IOException {
  write(JavaConverters.asScalaIteratorConverter(records).asScala());
}
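The overload lets test code pass a plain java.util.Iterator instead of a Scala iterator. A small usage sketch (the writer construction is assumed to come from the surrounding test fixtures):

final ArrayList<Product2<Object, Object>> records = new ArrayList<>();
records.add(new Tuple2<>(1, 1));
records.add(new Tuple2<>(2, 2));
// The convenience overload converts the Java iterator to a Scala one internally.
writer.write(records.iterator());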
@Override
public Option<MapStatus> stop(boolean success) {
  try {
    taskContext.taskMetrics().incPeakExecutionMemory(getPeakMemoryUsedBytes());
    if (stopping) {
      return Option.apply(null);
    } else {
      stopping = true;
      if (success) {
        if (mapStatus == null) {
          throw new IllegalStateException("Cannot call stop(true) without having called write()");
        }
        return Option.apply(mapStatus);
      } else {
        return Option.apply(null);
      }
    }
  } finally {
    if (sorter != null) {
      // If sorter is non-null, then this implies that we called stop() in response to an error,
      // so we need to clean up memory and spill files created by the sorter
      sorter.cleanupResources();
    }
  }
}
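stop(true) returns the MapStatus produced by write(), while stop(false) only cleans up. A hedged sketch of the caller-side contract (runTask and its shape are illustrative, not Spark's scheduler code):

import java.io.IOException;
import org.apache.spark.scheduler.MapStatus;

final class ShuffleTaskSketch {
  static MapStatus runTask(
      UnsafeShuffleWriter<Object, Object> writer,
      scala.collection.Iterator<scala.Product2<Object, Object>> records) throws IOException {
    boolean success = false;
    try {
      writer.write(records);
      success = true;
      return writer.stop(true).get(); // mapStatus is defined after a successful write()
    } finally {
      if (!success) {
        writer.stop(false); // cleanup-only path; releases sorter memory and spill files
      }
    }
  }
}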
if (fastMergeEnabled && fastMergeIsSupported) {
  // Compression is disabled or the codec supports decompression of concatenated
  // streams, so spills can be merged without interpreting the spilled bytes.
  if (transferToEnabled) {
    logger.debug("Using transferTo-based fast merge");
    partitionLengths = mergeSpillsWithTransferTo(spills, outputFile);
  } else {
    logger.debug("Using fileStream-based fast merge");
    partitionLengths = mergeSpillsWithFileStream(spills, outputFile, null);
  }
} else {
  logger.debug("Using slow merge");
  partitionLengths = mergeSpillsWithFileStream(spills, outputFile, compressionCodec);
}
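Fast merge is possible because every spill file lays out its partitions in the same order, so merging partition i is pure byte concatenation across spills. A hedged, simplified sketch of the stream-based variant (the real code also threads compression, encryption, and I/O metrics through these streams):

import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;

final class FastMergeSketch {
  // Concatenate partition p of every spill file, in spill order, into outputFile.
  static long[] concatenateSpills(File[] spillFiles, long[][] partitionLengthsPerSpill,
                                  int numPartitions, File outputFile) throws IOException {
    final long[] partitionLengths = new long[numPartitions];
    try (FileOutputStream out = new FileOutputStream(outputFile)) {
      // One open stream per spill; each is consumed front to back across partitions.
      final FileInputStream[] ins = new FileInputStream[spillFiles.length];
      try {
        for (int s = 0; s < spillFiles.length; s++) {
          ins[s] = new FileInputStream(spillFiles[s]);
        }
        final byte[] buffer = new byte[8192];
        for (int p = 0; p < numPartitions; p++) {
          for (int s = 0; s < spillFiles.length; s++) {
            long remaining = partitionLengthsPerSpill[s][p];
            while (remaining > 0) {
              final int read = ins[s].read(buffer, 0, (int) Math.min(buffer.length, remaining));
              if (read == -1) throw new IOException("Spill file truncated");
              out.write(buffer, 0, read);
              remaining -= read;
              partitionLengths[p] += read;
            }
          }
        }
      } finally {
        for (FileInputStream in : ins) {
          if (in != null) in.close();
        }
      }
    }
    return partitionLengths;
  }
}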
private UnsafeShuffleWriter<Object, Object> createWriter(boolean transferToEnabled)
    throws IOException {
  conf.set("spark.file.transferTo", String.valueOf(transferToEnabled));
  return new UnsafeShuffleWriter<>(
    blockManager,
    shuffleBlockResolver,
    taskMemoryManager,
    new SerializedShuffleHandle<>(0, 1, shuffleDep),
    0, // map id
    taskContext,
    conf);
}
this.initialSortBufferSize =
  sparkConf.getInt("spark.shuffle.sort.initialBufferSize", DEFAULT_INITIAL_SORT_BUFFER_SIZE);
open();
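The initial sort-buffer size is a normal Spark configuration entry, so it can be tuned per job. A small hedged example of overriding it (the value is illustrative):

import org.apache.spark.SparkConf;

// Start the in-memory sorter with room for 8192 record pointers instead of the
// default, avoiding early buffer growth for tasks that emit many records.
SparkConf conf = new SparkConf()
  .set("spark.shuffle.sort.initialBufferSize", "8192");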
this.outputBufferSizeInBytes =
  (int) (long) sparkConf.get(package$.MODULE$.SHUFFLE_UNSAFE_FILE_OUTPUT_BUFFER_SIZE()) * 1024;
open();
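The configured value is in kibibytes, hence the multiplication by 1024 to get bytes. A hedged usage example, assuming this entry is exposed as the spark.shuffle.unsafe.file.output.buffer key:

import org.apache.spark.SparkConf;

// Request a 64 KiB buffer for the shuffle output stream; the writer converts
// the configured KiB value into outputBufferSizeInBytes.
SparkConf conf = new SparkConf()
  .set("spark.shuffle.unsafe.file.output.buffer", "64k");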