/**
 * Reads the frequent patterns generated from text input.
 *
 * @return list of (feature, top-k patterns) pairs, one entry per frequent string feature
 */
public static List<Pair<String,TopKStringPatterns>> readFrequentPattern(Parameters params) throws IOException {
  Configuration configuration = new Configuration();
  Path patternsDir = new Path(params.get(OUTPUT), FREQUENT_PATTERNS);
  FileSystem fileSystem = FileSystem.get(patternsDir.toUri(), configuration);
  // Collect the patterns from every part file the aggregation job produced.
  FileStatus[] partFiles = fileSystem.globStatus(new Path(patternsDir, FILE_PATTERN));
  List<Pair<String,TopKStringPatterns>> patterns = Lists.newArrayList();
  for (FileStatus part : partFiles) {
    patterns.addAll(FPGrowth.readFrequentPattern(configuration, part.getPath()));
  }
  return patterns;
}
/**
 * Loads the cached fList and job parameters before the first reduce call.
 * Rebuilds the feature-index -> (name, frequency) lookup tables used while
 * decoding transaction patterns.
 */
@Override
protected void setup(Context context) throws IOException, InterruptedException {
  super.setup(context);
  Parameters params = new Parameters(context.getConfiguration().get(PFPGrowth.PFP_PARAMETERS, ""));
  // The fList order defines the feature index: position i maps back to
  // featureReverseMap.get(i) with frequency freqList.get(i).
  for (Pair<String,Long> e : PFPGrowth.readFList(context.getConfiguration())) {
    featureReverseMap.add(e.getFirst());
    freqList.add(e.getSecond());
  }
  // parseInt avoids the needless Integer boxing of Integer.valueOf(...)
  maxHeapSize = Integer.parseInt(params.get(PFPGrowth.MAX_HEAP_SIZE, "50"));
  minSupport = Integer.parseInt(params.get(PFPGrowth.MIN_SUPPORT, "3"));
  maxPerGroup = params.getInt(PFPGrowth.MAX_PER_GROUP, 0);
  numFeatures = featureReverseMap.size();
  useFP2 = "true".equals(params.get(PFPGrowth.USE_FPG2));
}
}
/**
 * Compiles the record-splitting regex from the job parameters before the
 * first map call.
 */
@Override
protected void setup(Context context) throws IOException, InterruptedException {
  super.setup(context);
  String packedParams = context.getConfiguration().get(PFPGrowth.PFP_PARAMETERS, "");
  Parameters params = new Parameters(packedParams);
  String regex = params.get(PFPGrowth.SPLIT_PATTERN, PFPGrowth.SPLITTER.toString());
  splitter = Pattern.compile(regex);
}
}
/**
 * Loads the fList and job parameters before the first map call: builds the
 * feature -> rank map, compiles the split pattern, and reads the per-group
 * feature cap.
 */
@Override
protected void setup(Context context) throws IOException, InterruptedException {
  super.setup(context);
  // Rank each frequent feature by its position in the cached fList.
  int rank = 0;
  for (Pair<String,Long> entry : PFPGrowth.readFList(context.getConfiguration())) {
    fMap.put(entry.getFirst(), rank);
    rank++;
  }
  String packedParams = context.getConfiguration().get(PFPGrowth.PFP_PARAMETERS, "");
  Parameters params = new Parameters(packedParams);
  splitter = Pattern.compile(params.get(PFPGrowth.SPLIT_PATTERN, PFPGrowth.SPLITTER.toString()));
  maxPerGroup = params.getInt(PFPGrowth.MAX_PER_GROUP, 0);
}
}
/**
 * Reads the max-heap-size parameter before the first map/reduce call.
 */
@Override
protected void setup(Context context) throws IOException, InterruptedException {
  super.setup(context);
  // Use the shared PFPGrowth constants instead of duplicating the literal
  // keys "pfp.parameters"/"maxHeapSize" — keeps this class in sync with the
  // other mappers/reducers if the key strings ever change.
  Parameters params = new Parameters(context.getConfiguration().get(PFPGrowth.PFP_PARAMETERS, ""));
  // parseInt avoids the needless Integer boxing of Integer.valueOf(...)
  maxHeapSize = Integer.parseInt(params.get(PFPGrowth.MAX_HEAP_SIZE, "50"));
}
}
// Block-compress the sequence-file output of the aggregation job.
conf.set("mapred.output.compression.type", "BLOCK");
// Input is the per-group FP-Growth output; output is the final
// frequent-patterns directory.
Path input = new Path(params.get(OUTPUT), FP_GROWTH);
Job job = new Job(conf, "PFP Aggregator Driver running over input: " + input);
job.setJarByClass(PFPGrowth.class);
Path outPath = new Path(params.get(OUTPUT), FREQUENT_PATTERNS);
FileOutputFormat.setOutputPath(job, outPath);
// Block-compress the sequence-file output of the counting job.
conf.set("mapred.output.compression.type", "BLOCK");
// The parallel-counting job reads the raw transactions and writes
// per-feature counts under <output>/PARALLEL_COUNTING.
String input = params.get(INPUT);
Job job = new Job(conf, "Parallel Counting Driver running over input: " + input);
job.setJarByClass(PFPGrowth.class);
Path outPath = new Path(params.get(OUTPUT), PARALLEL_COUNTING);
FileOutputFormat.setOutputPath(job, outPath);
// Compress intermediate map output and block-compress the final
// sequence-file output of the parallel FP-Growth job.
conf.set("mapred.compress.map.output", "true");
conf.set("mapred.output.compression.type", "BLOCK");
Path input = new Path(params.get(INPUT));
// Fix: add the missing ": " separator so the job name matches the other
// drivers ("... running over input: <path>").
Job job = new Job(conf, "PFP Growth Driver running over input: " + input);
job.setJarByClass(PFPGrowth.class);
Path outPath = new Path(params.get(OUTPUT), FP_GROWTH);
FileOutputFormat.setOutputPath(job, outPath);
/**
 * Serializes the fList (feature, frequency) pairs to a SequenceFile at
 * {@code <output>/F_LIST} and registers that file in the DistributedCache so
 * subsequent map/reduce tasks can load it locally.
 *
 * <p>Note: despite the previous comment, nothing is returned — this method
 * is {@code void}; its effect is the written file plus the cache entry.</p>
 */
public static void saveFList(Iterable<Pair<String,Long>> flist, Parameters params, Configuration conf)
  throws IOException {
  Path flistPath = new Path(params.get(OUTPUT), F_LIST);
  FileSystem fs = FileSystem.get(flistPath.toUri(), conf);
  flistPath = fs.makeQualified(flistPath);
  // Remove any stale fList left over from a previous run.
  HadoopUtil.delete(conf, flistPath);
  SequenceFile.Writer writer = new SequenceFile.Writer(fs, conf, flistPath, Text.class, LongWritable.class);
  try {
    for (Pair<String,Long> pair : flist) {
      writer.append(new Text(pair.getFirst()), new LongWritable(pair.getSecond()));
    }
  } finally {
    // Always close so the file is flushed even if append throws.
    writer.close();
  }
  // Make the serialized fList available to every task via the distributed cache.
  DistributedCache.addCacheFile(flistPath.toUri(), conf);
}
/**
 * Runs the sequential (single-machine) FPGrowth algorithm with the given
 * parameters. (Body continues beyond this chunk.)
 */
private static void runFPGrowth(Parameters params) throws IOException {
  log.info("Starting Sequential FPGrowth");
  int maxHeapSize = Integer.valueOf(params.get("maxHeapSize", "50"));
  int minSupport = Integer.valueOf(params.get("minSupport", "3"));
  Path output = new Path(params.get("output", "output.txt"));
  Path input = new Path(params.get("input"));
  // NOTE(review): no default supplied for "encoding" — Charset.forName(null)
  // would throw; presumably the CLI layer always sets it. Confirm upstream.
  Charset encoding = Charset.forName(params.get("encoding"));
  String pattern = params.get("splitPattern", PFPGrowth.SPLITTER.toString());
  // Select the alternate FPGrowth2 implementation when requested.
  if ("true".equals(params.get(PFPGrowth.USE_FPG2))) {
    org.apache.mahout.fpm.pfpgrowth.fpgrowth2.FPGrowthObj<String> fp =
      new org.apache.mahout.fpm.pfpgrowth.fpgrowth2.FPGrowthObj<String>();
// NOTE(review): this chunk appears truncated — the validation branches print
// "Exiting..." but no return/exit statement is visible, several closing braces
// are missing, and the second assignment to inputDir has seemingly lost its
// "else". Verify against the full source before editing.
if((params.get("resume")!=null)&&(params.get("keepFiles")!=null)){
  // -r and -k cannot be combined: resume reuses the intermediate files that
  // keepFiles would redirect elsewhere.
  System.out.println("-(r)esume & -(k)eepFiles are mutually exclusive options");
  System.out.println("Exiting...");
  if((params.get("input")!=null)&&(params.get("resume")!=null)){
    // -r and -i conflict: resume derives its input from the previous run.
    System.out.println("-(r)esume & -(i)nput are mutually exclusive options");
    System.out.println("Exiting...");
    if((params.get("input")==null)&&(params.get("resume")==null)){
      // One source of data is mandatory.
      System.out.println("At least one option from -(i)nput or -(r)esume must be specified");
      System.out.println("Exiting...");
      if(params.get("input")!=null){
        inputDir = new Path(params.get("input"));
        // NOTE(review): presumably an "else" belongs before this line so the
        // resume path is only used when -input is absent — confirm.
        inputDir = new Path(params.get("resume"));
        if(params.get("resume")!=null) commonConfig.setResumeOption(true); else commonConfig.setResumeOption(false);
        if(params.get("keepFiles")!=null){
          commonConfig.setKeepFilesOption(true);
          // Clear out any stale intermediate directory before reuse.
          Path intermediateDir = new Path(params.get("keepFiles"));
          if(fs.exists(intermediateDir)){
            fs.delete(intermediateDir, true);
            commonConfig.setIntermediatePath(params.get("keepFiles"));