@Override
public void checkOutputSpecs(FileSystem ignored, JobConf job)
    throws FileAlreadyExistsException, InvalidJobConfException, IOException {
  String tableName = job.get(OUTPUT_TABLE);
  if (tableName == null) {
    throw new IOException("Must specify table name");
  }
}
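// Hedged driver-side sketch of what the check above expects: the output table name must be
// present in the JobConf before the job is submitted. This assumes OUTPUT_TABLE resolves to
// HBase's TableOutputFormat.OUTPUT_TABLE key; "web_logs" is a placeholder table name.
JobConf job = new JobConf();
job.set(TableOutputFormat.OUTPUT_TABLE, "web_logs");
// checkOutputSpecs(fs, job) now passes instead of throwing "Must specify table name".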
/**
 * On Tez we're not creating dummy files when getting/setting input paths.
 * We let Tez handle the situation. We're also setting the paths in the AM
 * so we don't want to depend on scratch dir and context.
 */
public static List<Path> getInputPathsTez(JobConf job, MapWork work) throws Exception {
  String scratchDir = job.get(DagUtils.TEZ_TMP_DIR_KEY);
  List<Path> paths = getInputPaths(job, work, new Path(scratchDir), null, true);
  return paths;
}
/**
 * Merge HadoopConfiguration into JobConf. This is necessary for the HDFS configuration.
 */
public static void mergeHadoopConf(JobConf jobConf) {
  // we have to load the global configuration here, because the HadoopInputFormatBase does not
  // have access to a Flink configuration object
  org.apache.flink.configuration.Configuration flinkConfiguration =
      GlobalConfiguration.loadConfiguration();
  Configuration hadoopConf = getHadoopConfiguration(flinkConfiguration);
  for (Map.Entry<String, String> e : hadoopConf) {
    if (jobConf.get(e.getKey()) == null) {
      jobConf.set(e.getKey(), e.getValue());
    }
  }
}
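// Minimal usage sketch (not taken from Flink itself): merge the cluster-side Hadoop settings
// into a freshly created JobConf before handing it to a Hadoop input/output format wrapper,
// so HDFS keys such as fs.defaultFS are not silently dropped. The input path is a placeholder.
JobConf jobConf = new JobConf();
mergeHadoopConf(jobConf);
org.apache.hadoop.mapred.FileInputFormat.addInputPath(jobConf, new Path("/data/in"));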
public ParquetRecordWriterWrapper(
    final OutputFormat<Void, ParquetHiveRecord> realOutputFormat,
    final JobConf jobConf,
    final String name,
    final Progressable progress,
    Properties tableProperties) throws IOException {
  try {
    // create a TaskInputOutputContext
    TaskAttemptID taskAttemptID = TaskAttemptID.forName(jobConf.get("mapred.task.id"));
    if (taskAttemptID == null) {
      taskAttemptID = new TaskAttemptID();
    }
    taskContext = ContextUtil.newTaskAttemptContext(jobConf, taskAttemptID);

    LOG.info("initialize serde with table properties.");
    initializeSerProperties(taskContext, tableProperties);

    LOG.info("creating real writer to write at " + name);
    realWriter =
        ((ParquetOutputFormat) realOutputFormat).getRecordWriter(taskContext, new Path(name));
    LOG.info("real writer: " + realWriter);
  } catch (final InterruptedException e) {
    throw new IOException(e);
  }
}
@Override
public void checkOutputSpecs(FileSystem fs, JobConf jc) throws IOException {
  // obtain delegation tokens for the job
  if (UserGroupInformation.getCurrentUser().hasKerberosCredentials()) {
    TableMapReduceUtil.initCredentials(jc);
  }

  String hbaseTableName = jc.get(HBaseSerDe.HBASE_TABLE_NAME);
  jc.set(TableOutputFormat.OUTPUT_TABLE, hbaseTableName);
  Job job = new Job(jc);
  JobContext jobContext = ShimLoader.getHadoopShims().newJobContext(job);

  try {
    checkOutputSpecs(jobContext);
  } catch (InterruptedException e) {
    throw new IOException(e);
  }
}
conf.set("cluster.xml", new ClusterMapper().writeCluster(cluster)); conf.set("stores.xml", new StoreDefinitionsMapper().writeStoreList(Collections.singletonList(storeDef))); conf.setBoolean(VoldemortBuildAndPushJob.SAVE_KEYS, saveKeys); conf.setReduceSpeculativeExecution(false); FileInputFormat.setInputPaths(conf, inputPath); conf.set("final.output.dir", outputDir.toString()); conf.set(VoldemortBuildAndPushJob.CHECKSUM_TYPE, CheckSum.toString(checkSumType)); conf.set("dfs.umaskmode", "002"); throw new IOException("Final output directory already exists."); AvroJob.setInputSchema(conf, Schema.parse(baseJobConf.get(AVRO_REC_SCHEMA))); Path directoryPath = new Path(outputDir.toString(), directoryName);
@Override
public org.apache.hadoop.hive.ql.exec.FileSinkOperator.RecordWriter getHiveRecordWriter(
    JobConf jobConf, Path path, Class<? extends Writable> valueClass, boolean isCompressed,
    Properties properties, Progressable progressable) throws IOException {
  Schema schema;
  try {
    schema = AvroSerdeUtils.determineSchemaOrThrowException(jobConf, properties);
  } catch (AvroSerdeException e) {
    throw new IOException(e);
  }
  GenericDatumWriter<GenericRecord> gdw = new GenericDatumWriter<GenericRecord>(schema);
  DataFileWriter<GenericRecord> dfw = new DataFileWriter<GenericRecord>(gdw);

  if (isCompressed) {
    int level = jobConf.getInt(DEFLATE_LEVEL_KEY, DEFAULT_DEFLATE_LEVEL);
    String codecName = jobConf.get(OUTPUT_CODEC, DEFLATE_CODEC);
    CodecFactory factory = codecName.equals(DEFLATE_CODEC)
        ? CodecFactory.deflateCodec(level)
        : CodecFactory.fromString(codecName);
    dfw.setCodec(factory);
  }

  dfw.create(schema, path.getFileSystem(jobConf).create(path));
  return new AvroGenericRecordWriter(dfw);
}
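// Standalone sketch of the same Avro pattern the writer above relies on (the schema, codec
// level, and output file are made up for illustration): pick a codec via CodecFactory, attach
// it to a DataFileWriter, create the container file, and append a record.
Schema schema = new Schema.Parser().parse(
    "{\"type\":\"record\",\"name\":\"Rec\",\"fields\":[{\"name\":\"id\",\"type\":\"long\"}]}");
GenericDatumWriter<GenericRecord> datumWriter = new GenericDatumWriter<>(schema);
try (DataFileWriter<GenericRecord> writer = new DataFileWriter<>(datumWriter)) {
  writer.setCodec(CodecFactory.deflateCodec(6)); // mirrors the DEFLATE branch above
  writer.create(schema, new java.io.File("/tmp/example.avro"));
  GenericRecord rec = new GenericData.Record(schema);
  rec.put("id", 1L);
  writer.append(rec);
}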
String hbaseColumnsMapping = jobConf.get(HBaseSerDe.HBASE_COLUMNS_MAPPING);
boolean doColumnRegexMatching =
    jobConf.getBoolean(HBaseSerDe.HBASE_COLUMNS_REGEX_MATCHING, true);
List<Integer> readColIDs = ColumnProjectionUtils.getReadColumnIDs(jobConf);
columnMappings = HBaseSerDe.parseColumnsMapping(hbaseColumnsMapping, doColumnRegexMatching);
} catch (SerDeException e) {
  throw new IOException(e);
}

throw new IOException("Cannot read more columns than the given table contains.");

String scanCache = jobConf.get(HBaseSerDe.HBASE_SCAN_CACHE);
if (scanCache != null) {
  scan.setCaching(Integer.parseInt(scanCache));
}
String scanCacheBlocks = jobConf.get(HBaseSerDe.HBASE_SCAN_CACHEBLOCKS);
if (scanCacheBlocks != null) {
  scan.setCacheBlocks(Boolean.parseBoolean(scanCacheBlocks));
}
String scanBatch = jobConf.get(HBaseSerDe.HBASE_SCAN_BATCH);
if (scanBatch != null) {
  scan.setBatch(Integer.parseInt(scanBatch));
}
String filterObjectSerialized = jobConf.get(TableScanDesc.FILTER_OBJECT_CONF_STR);
    final Progressable progressable) throws IOException {
  String hbaseTableName = jc.get(HBaseSerDe.HBASE_TABLE_NAME);
  if (hbaseTableName == null) {
    hbaseTableName = tableProperties.getProperty(hive_metastoreConstants.META_TABLE_NAME);
  }
  jc.set(OUTPUT_TABLE_NAME_CONF_KEY, hbaseTableName);

  final Path columnFamilyPath = new Path(hfilePath);
  final String columnFamilyName = columnFamilyPath.getName();
  final byte[] columnFamilyNameBytes = Bytes.toBytes(columnFamilyName);

  FileStatus[] files = fs.listStatus(srcDir, FileUtils.STAGING_DIR_PATH_FILTER);
  if ((files == null) || (files.length == 0)) {
    throw new IOException("No family directories found in " + srcDir);
  }

  throw new IOException("Multiple family directories found in " + srcDir);
  throw new IOException("No family directories found in " + taskAttemptOutputdir + ". "
      + "The last component in hfile path should match column family name " + columnFamilyName);

  fs.rename(regionFile.getPath(), new Path(columnFamilyPath, regionFile.getPath().getName()));
throw new IOException("Acid table: " + table.getTableName() + " is missing from the ValidWriteIdList config: " + conf.get(ValidTxnWriteIdList.VALID_TABLES_WRITEIDS_KEY)); throw new IOException(e); dirs.get(0).toString()), ZeroRowsInputFormat.class.getName())); finalDirs.get(0).toString()), ZeroRowsInputFormat.class.getName()));
try {
  String partitionColumn = job.get(Constants.JDBC_PARTITION_COLUMN);
  int numPartitions = job.getInt(Constants.JDBC_NUM_PARTITIONS, -1);
  String lowerBound = job.get(Constants.JDBC_LOW_BOUND);
  String upperBound = job.get(Constants.JDBC_UPPER_BOUND);

  List<String> columnNames = dbAccessor.getColumnNames(job);
  if (!columnNames.contains(partitionColumn)) {
    throw new IOException("Cannot find partitionColumn:" + partitionColumn + " in " + columnNames);
  }

  List<TypeInfo> hiveColumnTypesList =
      TypeInfoUtils.getTypeInfosFromTypeString(job.get(serdeConstants.LIST_COLUMN_TYPES));
  TypeInfo typeInfo = hiveColumnTypesList.get(columnNames.indexOf(partitionColumn));
  if (!(typeInfo instanceof PrimitiveTypeInfo)) {
    throw new IOException(partitionColumn
        + " is a complex type, only primitive type can be a partition column");
  }

  throw new IOException("lowerBound of " + partitionColumn + " cannot be null");
LOG.error("Error checking non-combinable path", e); perfLogger.PerfLogEnd(CLASS_NAME, PerfLogger.GET_SPLITS); throw new IOException(e); String oldPaths = job.get(org.apache.hadoop.mapreduce.lib.input.FileInputFormat.INPUT_DIR); if (LOG.isDebugEnabled()) { LOG.debug("The received input paths are: [" + oldPaths + job.set(org.apache.hadoop.mapreduce.lib.input.FileInputFormat.INPUT_DIR, oldPaths);
process.inputClient.configure(TaskType.MAP,
    job.get(AvroJob.INPUT_SCHEMA),
    AvroJob.getMapOutputSchema(job).toString());

throw new IOException("Task failed: " + process.outputService.error());
throw new IOException("Task failed: " + t, t);
@Override
public void abortJob(JobContext context, int status) throws IOException {
  JobConf conf = ShimLoader.getHadoopShims().getJobConf(context);
  Path tmpLocation = new Path(conf.get(TMP_LOCATION));
  FileSystem fs = tmpLocation.getFileSystem(conf);
  LOG.debug("Removing " + tmpLocation.toString());
  fs.delete(tmpLocation, true);
}
private void getWriter(Reporter reporter, ObjectInspector inspector, int bucket) throws IOException {
  if (writer == null) {
    AcidOutputFormat.Options options = new AcidOutputFormat.Options(jobConf);
    options.inspector(inspector)
        .writingBase(jobConf.getBoolean(IS_MAJOR, false))
        .isCompressed(jobConf.getBoolean(IS_COMPRESSED, false))
        .tableProperties(new StringableMap(jobConf.get(TABLE_PROPS)).toProperties())
        .reporter(reporter)
        .minimumTransactionId(jobConf.getLong(MIN_TXN, Long.MAX_VALUE))
        .maximumTransactionId(jobConf.getLong(MAX_TXN, Long.MIN_VALUE))
        .bucket(bucket)
        .statementId(-1); // setting statementId == -1 makes compacted delta files use
                          // delta_xxxx_yyyy format

    // Instantiate the underlying output format
    @SuppressWarnings("unchecked") // since there is no way to parametrize instance of Class
    AcidOutputFormat<WritableComparable, V> aof =
        instantiate(AcidOutputFormat.class, jobConf.get(OUTPUT_FORMAT_CLASS_NAME));

    writer = aof.getRawRecordWriter(new Path(jobConf.get(TMP_LOCATION)), options);
  }
}
conf.set("hadoop.job.ugi", hadoop_ugi); conf.set("mapred.job.tracker", "local"); conf.set("fs.default.name", "file:///"); conf.set("mapred.local.dir", "/tmp/map-red"); info("Setting hadoop jar file for class:" + getClass() + " to " + conf.getJar()); info("*************************************************************************"); info(" Running on Real Hadoop Cluster(" + conf.get("mapred.job.tracker") + ") "); info("*************************************************************************"); FileStatus[] statuses = fs.listStatus(new Path(latestPath), filter); path = statuses[statuses.length - 1].getPath().toString(); System.out.println("Using latest folder: " + path); HadoopUtils.addAllSubPaths(conf, new Path(path)); FileOutputFormat.setOutputPath(conf, new Path(location));
private FileSinkOperator.RecordWriter getHiveWriter() throws IOException {
  if (this.hiveWriter == null) {
    Properties properties = new Properties();
    for (AvroSerdeUtils.AvroTableProperties tableProperty :
        AvroSerdeUtils.AvroTableProperties.values()) {
      String propVal;
      if ((propVal = jobConf.get(tableProperty.getPropName())) != null) {
        properties.put(tableProperty.getPropName(), propVal);
      }
    }

    Boolean isCompressed = jobConf.getBoolean("mapreduce.output.fileoutputformat.compress", false);
    Path path = new Path(this.fileName);
    if (path.getFileSystem(jobConf).isDirectory(path)) {
      // This path is only potentially encountered during setup
      // Otherwise, a specific part_xxxx file name is generated and passed in.
      path = new Path(path, "_dummy");
    }
    this.hiveWriter = getHiveRecordWriter(jobConf, path, null, isCompressed, properties, progressable);
  }
  return this.hiveWriter;
}
/**
 * Add a {@link Path} to the list of inputs for the map-reduce job.
 *
 * @param conf The configuration of the job
 * @param path {@link Path} to be added to the list of inputs for
 *             the map-reduce job.
 */
public static void addInputPath(JobConf conf, Path path) {
  path = new Path(conf.getWorkingDirectory(), path);
  String dirStr = StringUtils.escapeString(path.toString());
  String dirs = conf.get("mapred.input.dir");
  conf.set("mapred.input.dir",
      dirs == null ? dirStr : dirs + StringUtils.COMMA_STR + dirStr);
}
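// Hedged usage sketch, assuming the method above is (or mirrors) the old-API
// org.apache.hadoop.mapred.FileInputFormat.addInputPath(JobConf, Path); the paths are
// placeholders. Each call appends another escaped path to the comma-separated
// "mapred.input.dir" property on the JobConf.
JobConf conf = new JobConf();
FileInputFormat.addInputPath(conf, new Path("/data/2024-01-01"));
FileInputFormat.addInputPath(conf, new Path("/data/2024-01-02"));
// conf.get("mapred.input.dir") now contains both paths, comma-separated.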
/**
 * Retrieves a map of {@link Path}s to the {@link AvroMapper} class that
 * should be used for them.
 *
 * @param conf The configuration of the job
 * @see #addInputPath(JobConf, Path, Class, Schema)
 * @return A map of paths-to-mappers for the job
 */
@SuppressWarnings("unchecked")
static Map<Path, Class<? extends AvroMapper>> getMapperTypeMap(JobConf conf) {
  if (conf.get(mappersKey) == null) {
    return Collections.emptyMap();
  }
  Map<Path, Class<? extends AvroMapper>> m = new HashMap<>();
  String[] pathMappings = conf.get(mappersKey).split(",");
  for (String pathMapping : pathMappings) {
    String[] split = pathMapping.split(";");
    Class<? extends AvroMapper> mapClass;
    try {
      mapClass = (Class<? extends AvroMapper>) conf.getClassByName(split[1]);
    } catch (ClassNotFoundException e) {
      throw new RuntimeException(e);
    }
    m.put(new Path(split[0]), mapClass);
  }
  return m;
}
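// Hedged round-trip sketch, assuming this helper belongs to Avro's AvroMultipleInputs (the
// @see reference points at its addInputPath overload); WordCountMapper, the schema, and the
// input path are placeholders supplied by the caller.
JobConf conf = new JobConf();
AvroMultipleInputs.addInputPath(conf, new Path("/data/words"), WordCountMapper.class, schema);
Map<Path, Class<? extends AvroMapper>> mappers = getMapperTypeMap(conf);
// mappers now maps /data/words to WordCountMapper.class.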