@Override
public RecordReader<BytesWritable, BytesWritable> getRecordReader(InputSplit split, JobConf conf, Reporter reporter)
        throws IOException {
    String inputPathString = ((FileSplit) split).getPath().toUri().getPath();
    log.info("Input file path: " + inputPathString);
    Path inputPath = new Path(inputPathString);

    SequenceFile.Reader reader = new SequenceFile.Reader(inputPath.getFileSystem(conf), inputPath, conf);
    SequenceFile.Metadata meta = reader.getMetadata();

    try {
        Text keySchema = meta.get(new Text("key.schema"));
        Text valueSchema = meta.get(new Text("value.schema"));

        if (0 == keySchema.getLength() || 0 == valueSchema.getLength()) {
            throw new Exception();
        }

        // Update JobConf with the schemas read from the file metadata.
        conf.set("mapper.input.key.schema", keySchema.toString());
        conf.set("mapper.input.value.schema", valueSchema.toString());
    } catch (Exception e) {
        throw new IOException("Failed to load schema from file: " + inputPathString + "\n");
    }

    return super.getRecordReader(split, conf, reporter);
}
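For context, a minimal sketch of the writer side that would embed the key.schema and value.schema metadata entries read back by getRecordReader() above. The helper class name and schema handling are assumptions; only the metadata key names come from the snippet.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;

import java.io.IOException;

// Hypothetical helper: stores the key/value schemas in the SequenceFile metadata
// so the record reader above can recover them via SequenceFile.Reader#getMetadata().
public final class SchemaAwareSequenceFileWriter {
    public static SequenceFile.Writer open(Configuration conf, Path path,
            String keySchema, String valueSchema) throws IOException {
        SequenceFile.Metadata meta = new SequenceFile.Metadata();
        meta.set(new Text("key.schema"), new Text(keySchema));
        meta.set(new Text("value.schema"), new Text(valueSchema));
        return SequenceFile.createWriter(path.getFileSystem(conf), conf, path,
                BytesWritable.class, BytesWritable.class,
                SequenceFile.CompressionType.NONE, null, null, meta);
    }
}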
public Boolean readFileWithCache(long startTime) throws IOException, InterruptedException {
    if (fileKey == null) return false;
    BooleanRef gotAllData = new BooleanRef();
    long endOfSplit = split.getStart() + split.getLength();
    this.cachedData = cache.getFileData(fileKey, split.getStart(), endOfSplit,
        writerIncludes, CC_FACTORY, counters, gotAllData);
    if (cachedData == null) {
        // ... (lines elided in this search excerpt)
    }

    uncachedSuffixStart = slices.get(slices.size() - 1).getLastEnd();
    Ref<Integer> stripeIx = Ref.from(0);
    if (uncachedPrefixEnd > split.getStart()) {
        FileSplit sliceSplit = new FileSplit(split.getPath(), split.getStart(),
            uncachedPrefixEnd - split.getStart(), hosts, inMemoryHosts);
        if (!processOneFileSplit(sliceSplit, startTime, stripeIx, null)) return null;
    }

    // ... (lines elided in this search excerpt; "slice" below is the current cached slice)
    long start = slice.getKnownTornStart();
    long len = slice.getLastStart() - start; // Will also read the last row.
    FileSplit sliceSplit = new FileSplit(split.getPath(), start, len, hosts, inMemoryHosts);
    if (!processOneFileSplit(sliceSplit, startTime, stripeIx, slice)) return null;

    // ... (lines elided in this search excerpt)
    long size = split.getPath().getFileSystem(daemonConf).getFileStatus(split.getPath()).getLen();
    isUnfortunate = size > endOfSplit;
    if (isUnfortunate) {
        LlapIoImpl.LOG.warn("One-row mismatch at the end of split " + split.getPath()
            + " at " + endOfSplit + "; file size is " + size);
        FileSplit splitPart = new FileSplit(split.getPath(), uncachedSuffixStart,
            // ... (excerpt truncated)
/**
 * @param clsName Input split class name.
 * @param in Input stream.
 * @param hosts Optional hosts.
 * @return File block or {@code null} if it is not a {@link FileSplit} instance.
 * @throws IgniteCheckedException If failed.
 */
@Nullable public static HadoopFileBlock readFileBlock(String clsName, FSDataInputStream in,
    @Nullable String[] hosts) throws IgniteCheckedException {
    if (!FileSplit.class.getName().equals(clsName))
        return null;

    FileSplit split = U.newInstance(FileSplit.class);

    try {
        split.readFields(in);
    }
    catch (IOException e) {
        throw new IgniteCheckedException(e);
    }

    if (hosts == null)
        hosts = EMPTY_HOSTS;

    return new HadoopFileBlock(hosts, split.getPath().toUri(), split.getStart(), split.getLength());
}
public Optional<InternalHiveSplit> createInternalHiveSplit(FileSplit split)
        throws IOException {
    FileStatus file = fileSystem.getFileStatus(split.getPath());
    return createInternalHiveSplit(
            split.getPath(),
            fileSystem.getFileBlockLocations(file, split.getStart(), split.getLength()),
            split.getStart(),
            split.getLength(),
            file.getLen(),
            OptionalInt.empty(),
            false);
}
    fileSizes = true;
} else if (fileName == null) {
    fileName = new Path(arg);
} else {
    printUsage(null);
    // ... (lines elided in this search excerpt)
}

FileSystem fs = FileSystem.get(fileName.toUri(), conf);
long fileLen = fs.getFileStatus(fileName).getLen();
if (start < 0) {
    start = 0;
}
// ... (lines elided in this search excerpt)
FileSplit split = new FileSplit(fileName, start, length, new JobConf(conf));
RCFileRecordReader recordReader = new RCFileRecordReader(conf, split);
public ArrayList<String> readRecords(URL testFileUrl, int splitSize) throws IOException {
    // Set up context
    File testFile = new File(testFileUrl.getFile());
    long testFileSize = testFile.length();
    Path testFilePath = new Path(testFile.getAbsolutePath());
    Configuration conf = new Configuration();
    conf.setInt("io.file.buffer.size", 1);

    // Gather the records returned by the record reader
    ArrayList<String> records = new ArrayList<String>();

    long offset = 0;
    LongWritable key = new LongWritable();
    Text value = new Text();
    while (offset < testFileSize) {
        FileSplit split = new FileSplit(testFilePath, offset, splitSize, (String[]) null);
        LineRecordReader reader = new LineRecordReader(conf, split);

        while (reader.next(key, value)) {
            records.add(value.toString());
        }
        offset += splitSize;
    }
    return records;
}
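A hedged example of how this helper might be exercised in a test; the fixture name records.txt is an assumption, and JUnit's @Test/assertEquals are assumed to be available.

@Test
public void splitsShouldNotDropOrDuplicateRecords() throws IOException {
    // Hypothetical fixture: a small newline-delimited text file on the test classpath.
    URL testFileUrl = getClass().getClassLoader().getResource("records.txt");

    // Reading with different artificial split sizes must yield the same records,
    // since LineRecordReader assigns each line to exactly one split.
    ArrayList<String> coarse = readRecords(testFileUrl, 4096);
    ArrayList<String> fine = readRecords(testFileUrl, 16);
    assertEquals(coarse, fine);
}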
try {
    boolean sendSerializedEvents = conf.getBoolean(
        "mapreduce.tez.input.initializer.serialize.event.payload", true);
    boolean generateConsistentSplits = HiveConf.getBoolVar(conf,
        HiveConf.ConfVars.HIVE_TEZ_GENERATE_CONSISTENT_SPLITS);
    LOG.info("GenerateConsistentSplitsInHive=" + generateConsistentSplits);
    String realInputFormatName = conf.get("mapred.input.format.class");
    boolean groupingEnabled = userPayloadProto.getGroupingEnabled();
    if (groupingEnabled) {
        final long blockSize = conf.getLong(DFSConfigKeys.DFS_BLOCK_SIZE_KEY,
            // ... (lines elided in this search excerpt)

        FileSplit fileSplit = new FileSplit(fileStatus.getPath(), 0, fileStatus.getLen(), hosts);
        String alias = mapWork.getAliases().get(0);
        PartitionDesc partDesc = mapWork.getAliasToPartnInfo().get(alias);

        // ... (lines elided in this search excerpt; the next lines come from a different part of the file)
        if (inputSplit instanceof FileSplit) {
            final FileSplit fileSplit = (FileSplit) inputSplit;
            final Path path = fileSplit.getPath();
            final String fileStr = path.toString();
            if (!files.contains(fileStr)) {
                files.add(fileStr);
            }
        }
Path dir = new Path(tPart.getSd().getLocation());
long numRows = 0;
long rawDataSize = 0;
long fileSize = 0;
long numFiles = 0;
FileSystem fs = dir.getFileSystem(conf);
FileStatus[] fileList = HiveStatsUtils.getFileStatusRecurse(dir, -1, fs);
// ... (lines elided in this search excerpt)
if (!file.isDir()) {
    InputFormat<?, ?> inputFormat = ReflectionUtil.newInstance(
        partn.getInputFormatClass(), jc);
    InputSplit dummySplit = new FileSplit(file.getPath(), 0, 0,
        new String[] { partn.getLocation() });
    org.apache.hadoop.mapred.RecordReader<?, ?> recordReader =
        // ... (lines elided in this search excerpt)
    rawDataSize += statsRR.getStats().getRawDataSize();
    numRows += statsRR.getStats().getRowCount();
    fileSize += file.getLen();
    numFiles += 1;
    statsAvailable = true;
@SuppressWarnings("unchecked") // InputFormat instantiation
static long readBench(JobConf conf) throws IOException {
    InputFormat inf = conf.getInputFormat();
    final String fn = conf.get("test.filebench.name", "");
    Path pin = new Path(FileInputFormat.getInputPaths(conf)[0], fn);
    FileStatus in = pin.getFileSystem(conf).getFileStatus(pin);
    RecordReader rr = inf.getRecordReader(new FileSplit(pin, 0, in.getLen(), (String[]) null),
        conf, Reporter.NULL);
    try {
        Object key = rr.createKey();
        Object val = rr.createValue();
        Date start = new Date();
        while (rr.next(key, val));
        Date end = new Date();
        return end.getTime() - start.getTime();
    } finally {
        rr.close();
    }
}
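A hedged sketch of how readBench might be driven; the input path, file name, and input format below are assumptions for illustration.

JobConf conf = new JobConf();
conf.setInputFormat(org.apache.hadoop.mapred.TextInputFormat.class);
// Hypothetical benchmark directory and file name under it.
FileInputFormat.setInputPaths(conf, new Path("/benchmarks/filebench"));
conf.set("test.filebench.name", "part-00000");
long elapsedMs = readBench(conf);
System.out.println("Sequential read took " + elapsedMs + " ms");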
public OmnitureDataFileRecordReader(Configuration job, FileSplit split) throws IOException {
    this.maxLineLength = job.getInt("mapred.escapedlinereader.maxlength", Integer.MAX_VALUE);
    this.start = split.getStart();
    this.end = start + split.getLength();
    final Path file = split.getPath();
    CompressionCodecFactory compressionCodecs = new CompressionCodecFactory(job);
    final CompressionCodec codec = compressionCodecs.getCodec(file);

    // Open the file and seek to the start of the split
    FileSystem fs = file.getFileSystem(job);
    FSDataInputStream fileIn = fs.open(split.getPath());
    boolean skipFirstLine = false;
    if (codec != null) {
        lineReader = new EscapedLineReader(codec.createInputStream(fileIn), job);
        end = Long.MAX_VALUE;
    } else {
        if (start != 0) {
            skipFirstLine = true;
            --start;
            fileIn.seek(start);
        }
        lineReader = new EscapedLineReader(fileIn, job);
    }
    if (skipFirstLine) {
        start += lineReader.readLine(new Text(), 0,
            (int) Math.min((long) Integer.MAX_VALUE, end - start));
    }
    this.pos = start;
}
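For context, a hedged sketch of the next() method such a reader typically pairs with, modeled on Hadoop's old-API LineRecordReader. It assumes EscapedLineReader.readLine(Text, int, int) returns the number of bytes consumed, which is not shown in the snippet above.

public synchronized boolean next(LongWritable key, Text value) throws IOException {
    // Keep reading until we find a complete line that starts before the end of the split.
    while (pos < end) {
        key.set(pos);
        int newSize = lineReader.readLine(value, maxLineLength,
            Math.max((int) Math.min(Integer.MAX_VALUE, end - pos), maxLineLength));
        if (newSize == 0) {
            return false; // end of stream
        }
        pos += newSize;
        if (newSize < maxLineLength) {
            return true;
        }
        // Line exceeded maxLineLength; drop it and try the next one.
    }
    return false;
}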
public ConnectorPageSource newPageSource(FileFormatDataSourceStats stats, ConnectorSession session) {
    OrcPageSourceFactory orcPageSourceFactory = new OrcPageSourceFactory(TYPE_MANAGER, false, HDFS_ENVIRONMENT, stats);
    return HivePageSourceProvider.createHivePageSource(
            ImmutableSet.of(),
            ImmutableSet.of(orcPageSourceFactory),
            new Configuration(),
            session,
            fileSplit.getPath(),
            OptionalInt.empty(),
            fileSplit.getStart(),
            fileSplit.getLength(),
            fileSplit.getLength(),
            schema,
            TupleDomain.all(),
            columns,
            partitionKeys,
            DateTimeZone.UTC,
            TYPE_MANAGER,
            ImmutableMap.of(),
            Optional.empty(),
            false)
            .get();
}
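A hedged usage sketch of the page source returned above; it assumes a no-arg FileFormatDataSourceStats constructor, a ConnectorSession already in scope, and that the caller handles the IOException declared by close().

ConnectorPageSource pageSource = newPageSource(new FileFormatDataSourceStats(), session);
long rows = 0;
while (!pageSource.isFinished()) {
    Page page = pageSource.getNextPage();
    if (page != null) {
        rows += page.getPositionCount(); // one batch of rows for the requested columns
    }
}
pageSource.close();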
@Override
public List<WorkUnit> getWorkunits(SourceState state) {
    JobConf jobConf = new JobConf(new Configuration());
    for (String key : state.getPropertyNames()) {
        jobConf.set(key, state.getProp(key));
    }
    // ... (lines elided in this search excerpt)
    FileInputFormat.addInputPath(jobConf, new Path(inputPath));
    // ... (lines elided in this search excerpt)
    WorkUnit workUnit = WorkUnit.create(extract);
    workUnit.setProp(HadoopFileInputSource.FILE_SPLIT_BYTES_STRING_KEY,
        HadoopUtils.serializeToString(fileSplit));
    workUnit.setProp(HadoopFileInputSource.FILE_SPLIT_PATH_KEY, fileSplit.getPath().toString());
    workUnits.add(workUnit);
public AbstractFeatureReader(final InputSplit inputSplit, final JobConf jobConf) throws IOException {
    super(inputSplit, jobConf);

    // No need to check if instance of FileSplit as it is done in super class.
    final FileSplit fileSplit = (FileSplit) inputSplit;

    // Get .shp file
    final Path shpPath = fileSplit.getPath();
    final String dbfName = shpPath.getName().replace(".shp", ".dbf");
    final Path dbfPath = new Path(shpPath.getParent(), dbfName);

    m_dbfStream = dbfPath.getFileSystem(jobConf).open(dbfPath);
    m_dbfReader = new DBFReader(m_dbfStream);

    // Create a list of field names as Hadoop Text instances
    final List<DBFField> fields = m_dbfReader.getFields();
    m_keys = new ArrayList<Text>(fields.size());
    for (final DBFField field : fields) {
        m_keys.add(new Text(field.fieldName));
    }
}
tableProperties.setProperty("columns", Joiner.on(',').join(transform(testColumns, TestColumn::getName)));
tableProperties.setProperty("columns.types", Joiner.on(',').join(transform(testColumns, TestColumn::getType)));
serDe.initialize(new Configuration(), tableProperties);

JobConf jobConf = new JobConf();
configureCompression(jobConf, compressionCodec);
// ... (lines elided in this search excerpt; the following are arguments to a writer-creation call)
        new Path(filePath),
        Text.class,
        compressionCodec != HiveCompressionCodec.NONE,
// ... (lines elided in this search excerpt)
serDe.initialize(new Configuration(), tableProperties);
// ... (lines elided in this search excerpt)
Path path = new Path(filePath);
path.getFileSystem(new Configuration()).setVerifyChecksum(true);
File file = new File(filePath);
return new FileSplit(path, 0, file.length(), new String[0]);
private void setIncrementalConfigParams(InputSplit inputSplit) {
    if (inputSplit instanceof FileSplit) {
        FileSplit fileSplit = (FileSplit) inputSplit;
        this.incrementalConf = new Configuration(false);
        this.incrementalConf.set(JobContext.MAP_INPUT_FILE, fileSplit.getPath().toString());
        this.incrementalConf.setLong(JobContext.MAP_INPUT_START, fileSplit.getStart());
        this.incrementalConf.setLong(JobContext.MAP_INPUT_PATH, fileSplit.getLength());
    }
    LOG.info("Processing split: " + inputSplit);
}
        throws Exception
{
    JobConf configuration = new JobConf(new Configuration(false));
    configuration.set(READ_COLUMN_IDS_CONF_STR, "0");
    configuration.setBoolean(READ_ALL_COLUMNS, false);
    // ... (lines elided in this search excerpt; the following are arguments to a record-reader call)
            new FileSplit(new Path(tempFile.getFile().getAbsolutePath()), 0,
                tempFile.getFile().length(), (String[]) null),
            configuration,
            NULL);
@Test
public void testMaxBlockLocationsOldSplits() throws Exception {
    TEST_DIR.mkdirs();
    try {
        Configuration conf = new Configuration();
        conf.setInt(MRConfig.MAX_BLOCK_LOCATIONS_KEY, 4);
        Path submitDir = new Path(TEST_DIR.getAbsolutePath());
        FileSystem fs = FileSystem.getLocal(conf);
        org.apache.hadoop.mapred.FileSplit split =
            new org.apache.hadoop.mapred.FileSplit(new Path("/some/path"), 0, 1,
                new String[] { "loc1", "loc2", "loc3", "loc4", "loc5" });
        JobSplitWriter.createSplitFiles(submitDir, conf, fs,
            new org.apache.hadoop.mapred.InputSplit[] { split });
        JobSplit.TaskSplitMetaInfo[] infos =
            SplitMetaInfoReader.readSplitMetaInfo(new JobID(), fs, conf, submitDir);
        assertEquals("unexpected number of splits", 1, infos.length);
        assertEquals("unexpected number of split locations", 4, infos[0].getLocations().length);
    } finally {
        FileUtil.fullyDelete(TEST_DIR);
    }
}
serDe.initialize(CONFIGURATION, tableProperties);

JobConf jobConf = new JobConf();
if (compressionCodec != null) {
    CompressionCodec codec = new CompressionCodecFactory(CONFIGURATION).getCodecByName(compressionCodec);
    jobConf.set(COMPRESS_CODEC, codec.getClass().getName());
    jobConf.set(COMPRESS_TYPE, SequenceFile.CompressionType.BLOCK.toString());
}
RecordWriter recordWriter = createRecordWriter(new Path(filePath), CONFIGURATION);
// ... (lines elided in this search excerpt)
Path path = new Path(filePath);
path.getFileSystem(CONFIGURATION).setVerifyChecksum(true);
File file = new File(filePath);
return new FileSplit(path, 0, file.length(), new String[0]);