// Size the scan via the storage handler's estimator instead of listing files.
PlanUtils.configureInputJobPropertiesForStorageHandler(tableDesc);
Utilities.copyTableJobPropertiesToConf(tableDesc, jobConf);
long len = estimator.estimate(jobConf, scanOp, threshold).getTotalLength();
if (LOG.isDebugEnabled()) {
  LOG.debug("Threshold " + len + " exceeded for pseudoMR mode");
}
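
// A sketch (not from the Hive sources) of how an estimator can use the
// "remaining" argument passed above: an implementation may stop sizing the
// input once the running total exceeds the budget, since the caller only
// needs to know whether the threshold was crossed. BudgetAwareEstimator and
// splitSizes() are made-up illustration names.
import org.apache.hadoop.hive.ql.exec.TableScanOperator;
import org.apache.hadoop.hive.ql.metadata.DefaultStorageHandler;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.metadata.InputEstimator;
import org.apache.hadoop.mapred.JobConf;

public class BudgetAwareEstimator extends DefaultStorageHandler implements InputEstimator {
  @Override
  public Estimation estimate(JobConf job, TableScanOperator ts, long remaining)
      throws HiveException {
    long seen = 0;
    for (long splitLength : splitSizes(job)) { // hypothetical per-split byte counts
      seen += splitLength;
      if (remaining >= 0 && seen > remaining) {
        break; // already over budget; no need to size the rest of the input
      }
    }
    return new Estimation(-1, seen); // the call sites shown here only read getTotalLength()
  }

  private long[] splitSizes(JobConf job) {
    return new long[] {10L << 20, 20L << 20, 30L << 20}; // stand-in split metadata
  }
}
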
PlanUtils.configureInputJobPropertiesForStorageHandler(tableDesc);
Utilities.copyTableJobPropertiesToConf(tableDesc, jobConf);
// Use the jobConf just configured with the table's job properties; a
// remaining value of -1 asks the estimator for the full input length.
total += estimator.estimate(jobConf, scanOp, -1).getTotalLength();

private long calculateLength(ParseContext pctx, long remaining) throws Exception {
  JobConf jobConf = new JobConf(pctx.getConf());
  Utilities.setColumnNameList(jobConf, scanOp, true);
  Utilities.setColumnTypeList(jobConf, scanOp, true);
  HiveStorageHandler handler = table.getStorageHandler();
  if (handler instanceof InputEstimator) {
    // Delegate sizing to the storage handler's estimator.
    InputEstimator estimator = (InputEstimator) handler;
    TableDesc tableDesc = Utilities.getTableDesc(table);
    PlanUtils.configureInputJobPropertiesForStorageHandler(tableDesc);
    Utilities.copyTableJobPropertiesToConf(tableDesc, jobConf);
    return estimator.estimate(jobConf, scanOp, remaining).getTotalLength();
  }
  if (table.isNonNative()) {
    return 0; // nothing can be done
  }
  if (!table.isPartitioned()) {
    return getFileLength(jobConf, table.getPath(), table.getInputFormatClass());
  }
  // Partitioned native table: sum the data size of every surviving partition.
  long total = 0;
  for (Partition partition : partsList.getNotDeniedPartns()) {
    Path path = partition.getDataLocation();
    total += getFileLength(jobConf, path, partition.getInputFormatClass());
  }
  return total;
}
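
// Simplified sketch of the getFileLength() helper called above, covering the
// plain-filesystem case only (the real helper may also let input formats that
// track their own content summaries answer directly). Assumes the usual
// org.apache.hadoop.fs imports.
private long getFileLength(JobConf conf, Path path,
    Class<? extends InputFormat> inputFormatClass) throws IOException {
  // inputFormatClass is kept to match the call sites above; this sketch
  // always asks the filesystem.
  FileSystem fs = path.getFileSystem(conf);
  try {
    return fs.getContentSummary(path).getLength(); // total bytes under the path
  } catch (FileNotFoundException e) {
    return 0; // a missing location contributes nothing to the scan size
  }
}
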
@Override
public Estimation estimate(JobConf job, TableScanOperator ts, long remaining) throws HiveException {
  String hiveTableName = ts.getConf().getTableMetadata().getTableName();
  int reducerCount = job.getInt(hiveTableName + PhoenixStorageHandlerConstants.PHOENIX_REDUCER_NUMBER, 1);
  if (LOG.isDebugEnabled()) {
    LOG.debug("Estimating input size for table: " + hiveTableName + " with reducer count "
        + reducerCount + ". Remaining : " + remaining);
  }
  long bytesPerReducer = job.getLong(HiveConf.ConfVars.BYTESPERREDUCER.varname,
      Long.parseLong(HiveConf.ConfVars.BYTESPERREDUCER.getDefaultValue()));
  // Heuristic: assume the table's input amounts to reducerCount full reducer loads.
  long totalLength = reducerCount * bytesPerReducer;
  return new Estimation(0, totalLength);
}
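
// A standalone sketch (no Phoenix or Hive on the classpath) of the heuristic
// above: the estimate is simply reducerCount * bytesPerReducer, so it never
// inspects the underlying HBase data at all. The figures below are arbitrary
// examples, not defaults.
public class PhoenixEstimateArithmetic {
  public static void main(String[] args) {
    int reducerCount = 4;                      // value read from <table> + PHOENIX_REDUCER_NUMBER
    long bytesPerReducer = 256L * 1000 * 1000; // value of hive.exec.reducers.bytes.per.reducer
    System.out.println(reducerCount * bytesPerReducer); // prints 1024000000
  }
}
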
@Test
public void testGetInputSummaryWithInputEstimator() throws IOException, HiveException {
  final int NUM_PARTITIONS = 5;
  final int BYTES_PER_FILE = 10;
  final int NUM_OF_ROWS = 5;

  JobConf jobConf = new JobConf();
  Properties properties = new Properties();

  jobConf.setInt(HiveConf.ConfVars.HIVE_EXEC_INPUT_LISTING_MAX_THREADS.varname, 2);

  properties.setProperty(hive_metastoreConstants.META_TABLE_STORAGE,
      InputEstimatorTestClass.class.getName());
  InputEstimatorTestClass.setEstimation(new InputEstimator.Estimation(NUM_OF_ROWS, BYTES_PER_FILE));

  /* Write more bytes to the files than the estimation so we can verify that
     the estimator's length is used rather than the size on the filesystem. */
  ContentSummary summary = runTestGetInputSummary(jobConf, properties, NUM_PARTITIONS,
      BYTES_PER_FILE * 2, HiveInputFormat.class);
  assertEquals(NUM_PARTITIONS * BYTES_PER_FILE, summary.getLength());
  assertEquals(NUM_PARTITIONS * -1, summary.getFileCount());      // getInputSummary() currently returns -1 per file found
  assertEquals(NUM_PARTITIONS * -1, summary.getDirectoryCount()); // getInputSummary() currently returns -1 per file found

  // Test the deprecated mapred.dfsclient.parallelism.max property
  jobConf.setInt(HiveConf.ConfVars.HIVE_EXEC_INPUT_LISTING_MAX_THREADS.varname, 0);
  jobConf.setInt(Utilities.DEPRECATED_MAPRED_DFSCLIENT_PARALLELISM_MAX, 2);

  properties.setProperty(hive_metastoreConstants.META_TABLE_STORAGE,
      InputEstimatorTestClass.class.getName());
  InputEstimatorTestClass.setEstimation(new InputEstimator.Estimation(NUM_OF_ROWS, BYTES_PER_FILE));

  summary = runTestGetInputSummary(jobConf, properties, NUM_PARTITIONS,
      BYTES_PER_FILE * 2, HiveInputFormat.class);
  assertEquals(NUM_PARTITIONS * BYTES_PER_FILE, summary.getLength());
  assertEquals(NUM_PARTITIONS * -1, summary.getFileCount());
  assertEquals(NUM_PARTITIONS * -1, summary.getDirectoryCount());
}
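
// Hypothetical stand-in showing what the test above requires of
// InputEstimatorTestClass: a storage handler whose estimate() echoes a value
// injected through a static setter. StubEstimatorHandler is a made-up name;
// the real class ships with Hive's test sources. (Same imports as the
// BudgetAwareEstimator sketch near the top.)
public class StubEstimatorHandler extends DefaultStorageHandler implements InputEstimator {
  private static Estimation stubbed = new Estimation(0, 0);

  public static void setEstimation(Estimation estimation) {
    stubbed = estimation; // the test injects 5 rows / 10 bytes per partition
  }

  @Override
  public Estimation estimate(JobConf job, TableScanOperator ts, long remaining) {
    // Ignores the files on disk entirely, which is why the test writes
    // BYTES_PER_FILE * 2 yet still expects NUM_PARTITIONS * BYTES_PER_FILE.
    return stubbed;
  }
}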