@Override
protected void setup(Context context) throws IOException, InterruptedException {
  super.setup(context);
  Configuration conf = context.getConfiguration();
  // read the TF-IDF job parameters from the configuration
  vectorCount = conf.getLong(TFIDFConverter.VECTOR_COUNT, 1);
  featureCount = conf.getLong(TFIDFConverter.FEATURE_COUNT, 1);
  minDf = conf.getInt(TFIDFConverter.MIN_DF, 1);
  maxDf = conf.getLong(TFIDFConverter.MAX_DF, -1);
  sequentialAccess = conf.getBoolean(PartialVectorMerger.SEQUENTIAL_ACCESS, false);
  namedVector = conf.getBoolean(PartialVectorMerger.NAMED_VECTOR, false);

  // locate the document-frequency file shipped via the DistributedCache
  URI[] localFiles = DistributedCache.getCacheFiles(conf);
  Path dictionaryFile = HadoopUtil.findInCacheByPartOfFilename(TFIDFConverter.FREQUENCY_FILE, localFiles);

  // key is feature, value is the document frequency
  for (Pair<IntWritable,LongWritable> record
       : new SequenceFileIterable<IntWritable,LongWritable>(dictionaryFile, true, conf)) {
    dictionary.put(record.getFirst().get(), record.getSecond().get());
  }
}
@Override
protected void setup(Context context) throws IOException, InterruptedException {
  super.setup(context);
  Configuration conf = context.getConfiguration();
  // read the vectorizer parameters from the configuration
  dimension = conf.getInt(PartialVectorMerger.DIMENSION, Integer.MAX_VALUE);
  sequentialAccess = conf.getBoolean(PartialVectorMerger.SEQUENTIAL_ACCESS, false);
  namedVector = conf.getBoolean(PartialVectorMerger.NAMED_VECTOR, false);
  maxNGramSize = conf.getInt(DictionaryVectorizer.MAX_NGRAMS, maxNGramSize);

  // locate the dictionary file shipped via the DistributedCache
  URI[] localFiles = DistributedCache.getCacheFiles(conf);
  Path dictionaryFile = HadoopUtil.findInCacheByPartOfFilename(DictionaryVectorizer.DICTIONARY_FILE, localFiles);

  // key is word, value is its id
  for (Pair<Writable, IntWritable> record
       : new SequenceFileIterable<Writable, IntWritable>(dictionaryFile, true, conf)) {
    dictionary.put(record.getFirst().toString(), record.getSecond().get());
  }
}
@Test
public void nonExistingFile() {
  Path path = HadoopUtil.findInCacheByPartOfFilename("no such file", DISTRIBUTED_CACHE_FILES);
  assertNull(path);
}
@Test
public void existingFile() {
  Path path = HadoopUtil.findInCacheByPartOfFilename("want_to_find", DISTRIBUTED_CACHE_FILES);
  assertNotNull(path);
  assertEquals(FILE_I_WANT_TO_FIND.getName(), path.getName());
}
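The two tests above pin down the contract of HadoopUtil.findInCacheByPartOfFilename: given a filename fragment and the array of DistributedCache URIs, it returns a Path whose file name contains the fragment, or null when nothing matches. The following is a minimal sketch of a helper with that behavior; the loop-and-contains matching, the first-match-wins rule, and the null return for a null URI array are assumptions for illustration, not necessarily the actual Mahout implementation.

import java.net.URI;
import org.apache.hadoop.fs.Path;

// Hypothetical sketch of a cache-lookup helper with the behavior the tests expect:
// scan the cached URIs and return the first Path whose file name contains the fragment.
public final class CacheLookupSketch {

  private CacheLookupSketch() { }

  public static Path findInCacheByPartOfFilename(String partOfFilename, URI[] cacheFiles) {
    if (cacheFiles == null) {
      return null;                      // nothing was shipped via the DistributedCache
    }
    for (URI cacheFile : cacheFiles) {
      Path path = new Path(cacheFile.getPath());
      if (path.getName().contains(partOfFilename)) {
        return path;                    // first match wins
      }
    }
    return null;                        // no cached file matched the fragment
  }
}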