/**
 * Returns the total number of features known to this learner. The default implementation
 * simply reports the size of the lexicon; implementations whose lexicon may be unpopulated
 * (as appears to happen with SVM, for example) should override this method.
 *
 * @return The number of features.
 **/
protected int getNumberFeatures() {
    return this.lexicon.size();
}
/**
 * Returns the size of the lexicon after any pruning that may have taken place, or 0 if the
 * lexicon's location isn't known.
 *
 * @return The pruned lexicon size, or 0 when no lexicon is available.
 **/
public int getPrunedLexiconSize() {
    boolean lexiconEmpty = lexicon == null || lexicon.size() == 0;
    if (readLexiconOnDemand && lexiconEmpty) {
        // Read just the pruned-size header from the lexicon file instead of the whole lexicon.
        ExceptionlessInputStream stream =
                ExceptionlessInputStream.openCompressedStream(lexFilePath);
        int prunedSize = Lexicon.readPrunedSize(stream);
        stream.close();
        return prunedSize;
    }
    if (lexicon == null)
        return 0;
    return lexicon.getCutoff();
}
/**
 * Returns this learner's feature lexicon after discarding any feature counts it may have been
 * storing. This method is likely only useful when the lexicon and its counts are currently
 * stored on disk and {@link #readLexiconOnDemand(String)} or {@link #readLexiconOnDemand(URL)}
 * has already been called, in which case the lexicon is read from disk without wasting time
 * loading the counts.
 *
 * @return The lexicon, with no feature counts attached.
 **/
public Lexicon getLexiconDiscardCounts() {
    boolean notYetLoaded = lexicon == null || lexicon.size() == 0;
    if (readLexiconOnDemand && notYetLoaded) {
        // Load from disk, skipping the stored feature counts entirely.
        lexicon = Lexicon.readLexicon(lexFilePath, false);
    } else {
        // Lexicon is already in memory; drop whatever counts it is holding.
        lexicon.countFeatures(Lexicon.CountPolicy.none);
    }
    return lexicon;
}
/**
 * Retrieves a feature from the given lexicon by its identifier. Use this method together with
 * getFeatureId or convert so that the off-by-one adjustment is applied consistently.
 *
 * @param lexicon The lexicon to query.
 * @param id The one-based feature identifier.
 * @return The feature stored at the corresponding zero-based lexicon index.
 */
public static edu.illinois.cs.cogcomp.lbjava.classify.Feature getFeature(Lexicon lexicon, int id) {
    // External ids are one-based, while lookupKey expects a zero-based key.
    int index = id - 1;
    return lexicon.lookupKey(index);
}
if (labelLexicon.size() > 2 || solverType.equals("MCSVM_CS")) { newLabelLexicon = new Lexicon(); boolean same = true; for (int i = 0; i < allExamples.size(); i++) { Feature label = labelLexicon.lookupKey(allLabels.get(i)); int newLabel = newLabelLexicon.lookup(label, true); same &= newLabel == allLabels.get(i); allLabels.set(i, newLabel); if (same && newLabelLexicon.size() == labelLexicon.size()) newLabelLexicon = labelLexicon; else if (newLabelLexicon.size() > labelLexicon.size()) { System.err .println("LBJava ERROR: SupportVectorMachine: new label lexicon is too big!"); System.exit(1); } else { int N = newLabelLexicon.size(); predictions = new FVector(N); for (int i = 0; i < N; ++i) numClasses = newLabelLexicon.size(); for (int i = 0; i < numClasses && !conjunctiveLabels; ++i) conjunctiveLabels = newLabelLexicon.lookupKey(i).isConjunctive(); new DiscretePrimitiveStringFeature(labeler.containingPackage, labeler.name, "", allowableValues[1], (short) 1, (short) 2); int p = newLabelLexicon.lookup(f); int positive = 0;
/** * @see edu.illinois.cs.cogcomp.lbjava.learn.featurepruning.LexiconOptimizer#identifyUselessFeatures() */ @Override protected int[] identifyUselessFeatures() { // compile the whitelist HashSet<Feature> whitelist = compileWhitelist(lexicon); // look at each feature in the lexicon, any with zero weights can be safely discarded. int [] all = new int [this.lexicon.size()]; int count = 0; for (Object e : lexicon.getMap().entrySet()) { @SuppressWarnings("unchecked") Entry<Feature, Integer> entry = (Entry<Feature, Integer>) e; if (!whitelist.contains(entry.getKey())) { int fi = entry.getValue(); double wt = getWeight(fi); if (wt < this.threshold) { all[count] = fi; count++; } } } int[] useless = new int[count]; System.arraycopy(all, 0, useless, 0, count); Arrays.sort(useless); return useless; }
(short) labeler.allowableValues().length); if (labelLexicon.contains(f)) { int key = labelLexicon.lookup(f); score = ((BiasedRandomWeightVector) network.get(key)).dot(exampleFeatures, ((BiasedRandomWeightVector) network.get(l)).dot(exampleFeatures, exampleValues); result.put(labelLexicon.lookupKey(l).getStringValue(), score);
return (Object[]) example; if (readLexiconOnDemand && (lexicon == null || lexicon.size() == 0)) { readLexicon(lexFilePath); readLexiconOnDemand = false; Lexicon.CountPolicy countPolicy = lexicon.getCountPolicy(); int labelIndex = -1; Feature label = labelVector.getFeature(f); if (label.isDiscrete()) labelArray[f] = labelLexicon.lookup(label, true); else labelArray[f] = labelLexicon.lookup(label.getFeatureKey(labelLexicon), true); labelValues[f] += label.getStrength(); createPrediction(labelArray[f]); Feature feature = featureVector.getFeature(f); exampleArrayFeatures[f] = lexicon.lookup(feature.getFeatureKey(lexicon, training, labelIndex), training, labelIndex); exampleArrayValues[f] += feature.getStrength();
/**
 * Sets the value of {@link #parentLexicon} and makes sure that any features marked for removal
 * in this lexicon are the identical objects also present in the parent. This is useful in
 * particular just after lexicons have been read from disk.
 *
 * @param p The new parent lexicon.
 **/
public void setParent(Lexicon p) {
    parentLexicon = p;
    int size = lexiconInv.size();
    for (int index = 0; index < size; ++index) {
        Feature feature = lexiconInv.get(index);
        // Only non-null features flagged for removal (negative parent id) need re-aliasing.
        if (feature == null || parents.get(index) >= 0)
            continue;
        Feature canonical = p.lookupKey(p.lookup(feature));
        if (canonical == null) {
            System.err.println("LBJava ERROR: Can't find feature " + feature
                    + " in parent lexicon.");
            new Exception().printStackTrace();
            System.exit(1);
        }
        // Swap in the parent's identical instance so both lexicons share one object.
        lexiconInv.set(index, canonical);
        if (lexicon != null)
            lexicon.put(canonical, lexicon.remove(feature));
    }
}
Lexicon lexicon = preExtractLearner.getLexicon(); if (!policy.isNone() && lexicon.getCountPolicy() == Lexicon.CountPolicy.none) throw new IllegalArgumentException( "LBJava ERROR: BatchTrainer.pruneDataset: Can't prune with policy '" + policy afp.setIncludePruned(true); int[] swapMap = lexicon.prune(policy); && lexicon.isPruned(featureIndexes[unpruned - 1], labelIndexes[0], policy)) --unpruned; if (lexicon.isPruned(featureIndexes[i], labelIndexes[0], policy)) { int t = featureIndexes[i]; featureIndexes[i] = featureIndexes[--unpruned]; lexiconSize = lexicon.getCutoff(); preExtractLearner.saveLexicon();
+ "training with per class feature counts."); lazyMapCreation(); Integer I = (Integer) lexicon.get(f); return getCutoff(); lexicon.put(f, new Integer(key)); lexiconInv.add(f); incrementCount(key, label); return key; incrementCount(index, label); return index;
/**
 * <!-- lookup(Feature,boolean) --> Looks up a feature's index by delegating to
 * <code>lookup(f, training, -1)</code>. See {@link #lookup(Feature,boolean,int)} for more
 * details.
 *
 * @param f The feature to look up.
 * @param training Whether or not the learner is currently training.
 * @return The integer key that the feature maps to.
 **/
public int lookup(Feature f, boolean training) {
    return this.lookup(f, training, -1);
}
/**
 * Sets the labeler.
 *
 * @param l A labeling classifier.
 **/
public void setLabeler(Classifier l) {
    // An LTU requires exactly one binary-valued label classifier.
    boolean binary = l != null && l.allowableValues().length == 2;
    if (!binary) {
        System.err.println("Error: " + name
                + ": An LTU must be given a single binary label classifier.");
        new Exception().printStackTrace();
        System.exit(1);
    }
    super.setLabeler(l);
    allowableValues = l.allowableValues();

    // Rebuild the label lexicon so it contains exactly the two allowable label values,
    // then create the corresponding predictions in the same order.
    labelLexicon.clear();
    for (short value = 0; value < 2; ++value)
        labelLexicon.lookup(new DiscretePrimitiveStringFeature(l.containingPackage, l.name, "",
                allowableValues[value], value, (short) 2), true);
    createPrediction(0);
    createPrediction(1);
}
/**
 * This function gets the feature from the lexicon using the id. Use this function if you use
 * either getFeatureId or convert to avoid off-by-one errors.
 *
 * @param lexicon The lexicon in which to look up the feature.
 * @param id The one-based id of the desired feature.
 * @return The feature whose zero-based lexicon key is <code>id - 1</code>.
 */
public static edu.illinois.cs.cogcomp.lbjava.classify.Feature getFeature(Lexicon lexicon, int id) {
    // Ids handed out elsewhere are one-based, while lookupKey expects zero-based keys.
    return lexicon.lookupKey(id - 1);
}