@Override
public org.apache.hadoop.hive.ql.io.orc.CompressionKind getCompression() {
    // Map the wrapped (underlying) compression constant back to the corresponding
    // Hive ORC CompressionKind enum value.
    for (CompressionKind value : org.apache.hadoop.hive.ql.io.orc.CompressionKind.values()) {
        if (value.getUnderlying() == compressionKind) {
            return value;
        }
    }
    throw new IllegalArgumentException("Unknown compression kind " + compressionKind);
}
final long stripeSize = context.getProperty(STRIPE_SIZE).asDataSize(DataUnit.B).longValue();
final int bufferSize = context.getProperty(BUFFER_SIZE).asDataSize(DataUnit.B).intValue();
final CompressionKind compressionType = CompressionKind.valueOf(context.getProperty(COMPRESSION_TYPE).getValue());
final AtomicReference<Schema> hiveAvroSchema = new AtomicReference<>(null);
final AtomicInteger totalRecordCount = new AtomicInteger(0);
private void consumeCompressionKind(
        DmdlSemantics environment,
        AstAttribute attribute,
        Map<String, AstAttributeElement> elements,
        OrcFileTrait result) {
    CompressionKind option = consumeOption(
            environment, attribute, elements,
            ELEMENT_COMPRESSION_KIND,
            Messages.getString("OrcFileDriver.labelCompression"), //$NON-NLS-1$
            CompressionKind.values());
    if (option != null) {
        result.configuration().withCompressionKind(option);
    }
}
/**
 * Sets the generic compression that is used to compress the data.
 */
public WriterOptions compress(CompressionKind value) {
    super.compress(value.getUnderlying());
    return this;
}
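For context, a minimal usage sketch of applying a compression kind through WriterOptions, assuming the standard hive-exec OrcFile API; the path and schema below are made up for illustration, not taken from the snippet above.

    Configuration conf = new Configuration();
    ObjectInspector inspector = TypeInfoUtils.getStandardJavaObjectInspectorFromTypeInfo(
            TypeInfoUtils.getTypeInfoFromTypeString("struct<key:string>"));
    Writer writer = OrcFile.createWriter(new Path("/tmp/example.orc"),
            OrcFile.writerOptions(conf)
                .inspector(inspector)
                .compress(CompressionKind.ZLIB)   // compression kind for the writer
                .stripeSize(64L * 1024 * 1024)
                .bufferSize(256 * 1024));
    writer.addRow(Collections.singletonList("value"));
    writer.close();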
public FileMetaInfo getFileMetaInfo() {
    return new FileMetaInfo(compressionKind.toString(), bufferSize, metadataSize,
        footerByteBuffer, versionList, writerVersion);
}
public static void configureCompression(Configuration config, HiveCompressionCodec compressionCodec) {
    boolean compression = compressionCodec != HiveCompressionCodec.NONE;
    config.setBoolean(COMPRESSRESULT.varname, compression);
    config.setBoolean("mapred.output.compress", compression);
    config.setBoolean(FileOutputFormat.COMPRESS, compression);

    // For DWRF
    config.set(HIVE_ORC_DEFAULT_COMPRESS.varname, compressionCodec.getOrcCompressionKind().name());
    config.set(HIVE_ORC_COMPRESSION.varname, compressionCodec.getOrcCompressionKind().name());

    // For ORC
    config.set(OrcTableProperties.COMPRESSION.getPropName(), compressionCodec.getOrcCompressionKind().name());

    // For RCFile
    if (compressionCodec.getCodec().isPresent()) {
        config.set("mapred.output.compression.codec", compressionCodec.getCodec().get().getName());
        config.set(FileOutputFormat.COMPRESS_CODEC, compressionCodec.getCodec().get().getName());
    } else {
        config.unset("mapred.output.compression.codec");
        config.unset(FileOutputFormat.COMPRESS_CODEC);
    }

    // For Parquet
    config.set(ParquetOutputFormat.COMPRESSION, compressionCodec.getParquetCompressionCodec().name());

    // For SequenceFile
    config.set(FileOutputFormat.COMPRESS_TYPE, BLOCK.toString());
}
private boolean checkCompatibility(OrcFileKeyWrapper k) {
    // check compatibility with subsequent files
    if ((k.getTypes().get(0).getSubtypesCount() != columnCount)) {
        LOG.warn("Incompatible ORC file merge! Column counts mismatch for " + k.getInputPath());
        return false;
    }

    if (!k.getCompression().equals(compression)) {
        LOG.warn("Incompatible ORC file merge! Compression codec mismatch for " + k.getInputPath());
        return false;
    }

    if (k.getCompressBufferSize() != compressBuffSize) {
        LOG.warn("Incompatible ORC file merge! Compression buffer size mismatch for " + k.getInputPath());
        return false;
    }

    if (!k.getVersion().equals(version)) {
        LOG.warn("Incompatible ORC file merge! Version mismatch for " + k.getInputPath());
        return false;
    }

    if (k.getRowIndexStride() != rowIndexStride) {
        LOG.warn("Incompatible ORC file merge! Row index stride mismatch for " + k.getInputPath());
        return false;
    }

    return true;
}
@Override
public HDFSRecordWriter createHDFSRecordWriter(final ProcessContext context, final FlowFile flowFile,
        final Configuration conf, final Path path, final RecordSchema schema)
        throws IOException, SchemaNotFoundException {

    final long stripeSize = context.getProperty(STRIPE_SIZE).asDataSize(DataUnit.B).longValue();
    final int bufferSize = context.getProperty(BUFFER_SIZE).asDataSize(DataUnit.B).intValue();
    final CompressionKind compressionType = CompressionKind.valueOf(context.getProperty(COMPRESSION_TYPE).getValue());
    final boolean normalizeForHive = context.getProperty(HIVE_FIELD_NAMES).asBoolean();
    TypeInfo orcSchema = NiFiOrcUtils.getOrcSchema(schema, normalizeForHive);
    final Writer orcWriter = NiFiOrcUtils.createWriter(path, conf, orcSchema, stripeSize, compressionType, bufferSize);
    final String hiveTableName = context.getProperty(HIVE_TABLE_NAME).isSet()
            ? context.getProperty(HIVE_TABLE_NAME).evaluateAttributeExpressions(flowFile).getValue()
            : NiFiOrcUtils.normalizeHiveTableName(schema.getIdentifier().getName().orElse("unknown"));
    final boolean hiveFieldNames = context.getProperty(HIVE_FIELD_NAMES).asBoolean();

    return new ORCHDFSRecordWriter(orcWriter, schema, hiveTableName, hiveFieldNames);
}
compress.toString());
.method("withCompressionKind", //$NON-NLS-1$ new TypeBuilder(f, context.resolve(CompressionKind.class)) .field(conf.getCompressionKind().name()) .toExpression()) .toStatement());
public OrcStorage(String options) {
    String[] optsArr = options.split(" ");
    try {
        CommandLine configuredOptions = parser.parse(validOptions, optsArr);
        if (configuredOptions.hasOption('s')) {
            stripeSize = Long.parseLong(configuredOptions.getOptionValue('s'));
        }
        if (configuredOptions.hasOption('r')) {
            rowIndexStride = Integer.parseInt(configuredOptions.getOptionValue('r'));
        }
        if (configuredOptions.hasOption('b')) {
            bufferSize = Integer.parseInt(configuredOptions.getOptionValue('b'));
        }
        blockPadding = configuredOptions.hasOption('p');
        if (configuredOptions.hasOption('c')) {
            compress = CompressionKind.valueOf(configuredOptions.getOptionValue('c'));
        }
        if (configuredOptions.hasOption('v')) {
            version = Version.byName(configuredOptions.getOptionValue('v'));
        }
    } catch (ParseException e) {
        log.error("Exception in OrcStorage", e);
        log.error("OrcStorage called with arguments " + options);
        warn("ParseException in OrcStorage", PigWarning.UDF_WARNING_1);
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp("OrcStorage(',', '[options]')", validOptions);
        throw new RuntimeException(e);
    }
}
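An illustrative constructor call for the option string parsed above (the values are made up): '-c' selects the CompressionKind by name, '-s' the stripe size in bytes, '-b' the buffer size, and '-p' enables block padding.

    OrcStorage storage = new OrcStorage("-c SNAPPY -s 67108864 -b 262144 -p");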
MetaInfoObjExtractor(String codecStr, int bufferSize, int metadataSize,
        ByteBuffer footerBuffer) throws IOException {

    this.compressionKind = CompressionKind.valueOf(codecStr);
    this.bufferSize = bufferSize;
    this.codec = WriterImpl.createCodec(compressionKind);
    this.metadataSize = metadataSize;

    int position = footerBuffer.position();
    int footerBufferSize = footerBuffer.limit() - footerBuffer.position() - metadataSize;

    footerBuffer.limit(position + metadataSize);
    InputStream instream = InStream.create("metadata", Lists.<DiskRange>newArrayList(
        new BufferChunk(footerBuffer, 0)), metadataSize, codec, bufferSize);
    this.metadata = OrcProto.Metadata.parseFrom(instream);

    footerBuffer.position(position + metadataSize);
    footerBuffer.limit(position + metadataSize + footerBufferSize);
    instream = InStream.create("footer", Lists.<DiskRange>newArrayList(
        new BufferChunk(footerBuffer, 0)), footerBufferSize, codec, bufferSize);
    this.footer = OrcProto.Footer.parseFrom(instream);

    footerBuffer.position(position);
    this.inspector = OrcStruct.createObjectInspector(0, footer.getTypesList());
}
private List<String> writeSmallOrcFiles(Location baseLocation, int numInputFiles) throws IOException {
    TypeInfo typeInfo = TypeInfoUtils.getTypeInfoFromTypeString("struct<key:string>");
    ObjectInspector objectInspector = TypeInfoUtils.getStandardJavaObjectInspectorFromTypeInfo(typeInfo);

    Configuration hConf = new Configuration();
    FileSystem fileSystem = FileSystem.get(hConf);
    long stripeSize = HiveConf.getLongVar(hConf, HiveConf.ConfVars.HIVE_ORC_DEFAULT_STRIPE_SIZE);
    CompressionKind compressionKind =
        CompressionKind.valueOf(HiveConf.getVar(hConf, HiveConf.ConfVars.HIVE_ORC_DEFAULT_COMPRESS));
    int bufferSize = HiveConf.getIntVar(hConf, HiveConf.ConfVars.HIVE_ORC_DEFAULT_BUFFER_SIZE);
    int rowIndexStride = HiveConf.getIntVar(hConf, HiveConf.ConfVars.HIVE_ORC_DEFAULT_ROW_INDEX_STRIDE);

    List<String> writtenData = new ArrayList<>();
    for (int i = 0; i < numInputFiles; i++) {
        Location childFile = baseLocation.append("child_" + i);
        Writer orcWriter = OrcFile.createWriter(fileSystem, new Path(childFile.toURI()), hConf,
            objectInspector, stripeSize, compressionKind, bufferSize, rowIndexStride);
        try {
            String toWrite = "outputData" + i;
            orcWriter.addRow(Collections.singletonList(toWrite));
            writtenData.add(toWrite);
        } finally {
            orcWriter.close();
        }
    }
    Collections.sort(writtenData);
    return writtenData;
}
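A hypothetical read-back of one of the files written above, using the classic hive-exec ORC reader API; it assumes the same fileSystem and one of the childFile locations from the loop, and is a sketch rather than part of the original test.

    Reader orcReader = OrcFile.createReader(fileSystem, new Path(childFile.toURI()));
    RecordReader rows = orcReader.rows();
    Object row = null;
    while (rows.hasNext()) {
        row = rows.next(row);   // each row is an OrcStruct with the single "key" field
        System.out.println(row);
    }
    rows.close();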
bufferSizeValue = HiveConf.getIntVar(conf, HIVE_ORC_DEFAULT_BUFFER_SIZE);
blockPaddingValue = HiveConf.getBoolVar(conf, HIVE_ORC_DEFAULT_BLOCK_PADDING);
compressValue = CompressionKind.valueOf(HiveConf.getVar(conf, HIVE_ORC_DEFAULT_COMPRESS));
String versionName = HiveConf.getVar(conf, HIVE_ORC_WRITE_FORMAT);
if (versionName == null) {
options.compress(CompressionKind.valueOf(propVal));
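CompressionKind.valueOf is case-sensitive and throws IllegalArgumentException for unrecognized names, so a defensive variant might normalize the property value first. This is a sketch only; the ZLIB fallback is an assumption for illustration, not necessarily the configured default.

    CompressionKind kind;
    try {
        kind = CompressionKind.valueOf(propVal.trim().toUpperCase(Locale.ROOT));
    } catch (IllegalArgumentException e) {
        kind = CompressionKind.ZLIB;   // assumed fallback for this sketch
    }
    options.compress(kind);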