public InstanceList pipeInstances (PipeInputIterator source) { // I think that pipes should be associated neither with InstanceLists, nor // with Instances. -cas InstanceList toked = new InstanceList (tokenizationPipe); toked.add (source); InstanceList piped = new InstanceList (getFeaturePipe ()); piped.add (new InstanceListIterator (toked)); return piped; }
/** Convenience constructor for randomly generated test data: builds a list
 *  over a dictionary of <code>vocabSize</code> features and
 *  <code>numClasses</code> class names, drawing from a symmetric
 *  Dirichlet(2.0) over the dictionary.
 *  NOTE(review): the numeric arguments (30, 0, 10, 20) are interpreted by
 *  the delegated-to constructor, which is not visible here — confirm their
 *  meaning (instance count / length parameters?) against that signature. */
public InstanceList (Random r, int vocabSize, int numClasses)
{
    this (r, new Dirichlet(dictOfSize(vocabSize), 2.0), 30, 0, 10, 20, classNamesOfSize(numClasses));
}
if (weights.length != size()) throw new IllegalArgumentException("length of weight vector must equal number of instances"); if (size() == 0) return cloneEmpty(); for (int i = 0; i < size(); i++) { if (weights[i] < 0) throw new IllegalArgumentException("weight vector must be non-negative"); throw new IllegalArgumentException("weights must sum to positive value"); InstanceList newList = new InstanceList(); double[] probabilities = new double[size()]; double sumProbs = 0; for (int i = 0; i < size(); i++) { sumProbs += r.nextDouble(); probabilities[i] = sumProbs; probabilities[size() - 1] = sumOfWeights; while (a < size() && b < size()) { sumProbs += weights[b]; while (a < size() && probabilities[a] <= sumProbs) { newList.add(getInstance(b)); newList.setInstanceWeight(a, 1); a++;
/** Materializes a new InstanceList (sharing m_ilist's pipe) holding the
 *  instances of m_ilist selected by m_instIndices, in index order. */
public InstanceList getInstances ()
{
    InstanceList selected = new InstanceList (m_ilist.getPipe());
    for (int idx = 0; idx < m_instIndices.length; idx++) {
        selected.add (m_ilist.getInstance (m_instIndices[idx]));
    }
    return selected;
}
/** Returns a pair of new lists such that the first list in the pair contains
 * every <code>m</code>th element of this list, starting with the first.
 * The second list contains all remaining elements. */
public InstanceList[] splitByModulo (int m)
{
    InstanceList[] parts = new InstanceList[] { this.cloneEmpty(), this.cloneEmpty() };
    for (int i = 0; i < this.size(); i++) {
        // Indices divisible by m go to the first list; all others to the second.
        int which = (i % m == 0) ? 0 : 1;
        parts[which].instances.add (this.getInstance(i));
    }
    return parts;
}
/** Returns a new list of <code>numSamples</code> instances drawn uniformly
 *  at random from this list, with replacement. */
public InstanceList sampleWithReplacement (java.util.Random r, int numSamples)
{
    InstanceList sampled = this.cloneEmpty();
    // The source list is not modified below, so its size is loop-invariant.
    int n = instances.size();
    for (int draw = 0; draw < numSamples; draw++) {
        sampled.instances.add (this.getInstance (r.nextInt(n)));
    }
    return sampled;
}
this.globalFeatureSelection = trainingData.getFeatureSelection(); if (this.globalFeatureSelection == null) { this.globalFeatureSelection = new FeatureSelection (trainingData.getDataAlphabet()); trainingData.setFeatureSelection (this.globalFeatureSelection); if (validationData != null) validationData.setFeatureSelection (this.globalFeatureSelection); if (testingData != null) testingData.setFeatureSelection (this.globalFeatureSelection); if (trainingProportions != null && featureInductionIteration < trainingProportions.length) { System.out.println ("Training on "+trainingProportions[featureInductionIteration]+"% of the data this round."); InstanceList[] sampledTrainingData = trainingData.split (new Random(1), new double[] {trainingProportions[featureInductionIteration], 1-trainingProportions[featureInductionIteration]}); theTrainingData = sampledTrainingData[0]; theTrainingData.setFeatureSelection (this.globalFeatureSelection); // xxx necessary? System.out.println (" which is "+theTrainingData.size()+" instances"); InstanceList errorInstances = new InstanceList (trainingData.getDataAlphabet(), trainingData.getTargetAlphabet()); errorInstances.setFeatureSelection (this.globalFeatureSelection); ArrayList errorLabelVectors = new ArrayList(); InstanceList clusteredErrorInstances[][] = new InstanceList[numLabels][numLabels]; clusteredErrorInstances[i][j] = new InstanceList (trainingData.getDataAlphabet(), trainingData.getTargetAlphabet()); clusteredErrorInstances[i][j].setFeatureSelection (this.globalFeatureSelection); clusteredErrorLabelVectors[i][j] = new ArrayList();
FeatureSelection selectedFeatures = trainingList.getFeatureSelection(); if (selectedFeatures != null) throw new UnsupportedOperationException("FeatureSelection not yet implemented."); double w = 1.0 / trainingList.size(); InstanceList trainingInsts = new InstanceList(); for (int i = 0; i < trainingList.size(); i++) trainingInsts.add(trainingList.getInstance(i), w); boolean[] correct = new boolean[trainingInsts.size()]; int numClasses = trainingInsts.getTargetAlphabet().size(); if (numClasses != 2) logger.info("AdaBoostTrainer.train: WARNING: more than two classes"); Classifier[] weakLearners = new Classifier[numRounds]; double[] alphas = new double[numRounds]; InstanceList roundTrainingInsts = new InstanceList(); do { err = 0; roundTrainingInsts = trainingInsts.sampleWithInstanceWeights(random); weakLearners[round] = weakLearner.train (roundTrainingInsts, validationList); for (int i = 0; i < trainingInsts.size(); i++) { Instance inst = trainingInsts.getInstance(i); if (weakLearners[round].classify(inst).bestLabelIsCorrect()) correct[i] = true; else { correct[i] = false; err += trainingInsts.getInstanceWeight(i);
this.beam1 = beam1; this.beam2 = beam2; if (ilist.size() < minTrainingListSize) { logger.info ("FeatureInducer not inducing from less than "+minTrainingListSize+" features."); return; Alphabet tmpDV = (Alphabet) ilist.getDataAlphabet().clone(); FeatureSelection featuresSelected = ilist.getFeatureSelection(); InstanceList tmpilist = new InstanceList (tmpDV, ilist.getTargetAlphabet()); RankedFeatureVector gg = ranker.newRankedFeatureVector (ilist); logger.info ("Rank values before this round of conjunction-building"); for (int i = 0; i < ilist.size(); i++) { Instance inst = ilist.getInstance(i); FeatureVector fv = (FeatureVector) inst.getData (); tmpilist.add (new Instance (new FeatureVector (fv, tmpDV, fsMin, fsMax), inst.getTarget(), inst.getName(), inst.getSource(), null), ilist.getInstanceWeight(i)); Alphabet origV = ilist.getDataAlphabet(); int origVSize = origV.size(); nextfeatures:
FeatureSelection selectedFeatures = trainingList.getFeatureSelection(); if (selectedFeatures != null) throw new UnsupportedOperationException("FeatureSelection not yet implemented."); int numClasses = trainingList.getTargetAlphabet().size(); int numInstances = trainingList.size(); InstanceList trainingInsts = new InstanceList(); int numAdded = 0; for (int i = 0; i < numInstances; i++) { Instance inst = trainingList.getInstance(i); int trueClassIndex = inst.getLabeling().getBestIndex(); for (int j = 0; j < numClasses; j++) { if (j != trueClassIndex) { trainingInsts.add(inst, 1); classIndices[numAdded] = j; numAdded++; InstanceList roundTrainingInsts = new InstanceList(); int resamplingIterations = 0; do { epsilon = 0; roundTrainingInsts = new InstanceList(); int[] sampleIndices = sampleWithWeights(instIndices, weights, random); for (int i = 0; i < sampleIndices.length; i++) { Instance inst = trainingInsts.getInstance(sampleIndices[i]); roundTrainingInsts.add(inst, 1);
/** Accumulates, per feature index, either the number of occurrences across
 *  instances (when <code>countInstances</code> is set) or the summed feature
 *  values; instances with zero weight are skipped entirely.
 *  @param ilist instances whose data must all be FeatureVectors
 *  @return an array of length <code>ilist.getDataAlphabet().size()</code>
 *          holding the accumulated counts
 *  @throws IllegalArgumentException if any instance's data is not a
 *          FeatureVector */
private static double[] calcFeatureCounts (InstanceList ilist)
{
    // Removed unused locals (numInstances, numClasses, count) from the
    // original implementation; they were never read.
    int numFeatures = ilist.getDataAlphabet().size();
    double[] counts = new double[numFeatures];
    for (int i = 0; i < ilist.size(); i++) {
        Instance inst = ilist.getInstance(i);
        if (!(inst.getData() instanceof FeatureVector))
            throw new IllegalArgumentException ("Currently only handles FeatureVector data");
        FeatureVector fv = (FeatureVector) inst.getData ();
        // Zero-weight instances contribute nothing; skip them.
        if (ilist.getInstanceWeight(i) == 0)
            continue;
        for (int j = 0; j < fv.numLocations(); j++) {
            if (countInstances)
                counts[fv.indexAtLocation(j)] += 1;
            else
                counts[fv.indexAtLocation(j)] += fv.valueAtLocation(j);
        }
    }
    return counts;
}
/** Trains a MaxEnt classifier on citation pairs generated from each node
 *  group, freezes the data alphabet, and reports training F1 on "yes". */
private static Classifier trainPairwiseClassifier (ArrayList[] nodes, Pipe p)
{
    InstanceList pairList = new InstanceList (p);
    for (int n = 0; n < nodes.length; n++)
        pairList.add (CitationUtils.makePairs (p, nodes[n]));
    MaxEnt classifier = (MaxEnt) (new MaxEntTrainer().train (pairList, null, null, null, null));
    // Freeze the alphabet so later piping cannot grow the feature space.
    pairList.getDataAlphabet().stopGrowth();
    Trial trial = new Trial (classifier, pairList);
    System.out.println("Pairwise classifier: -> Training F1 on \"yes\" is: " + trial.labelF1("yes"));
    return classifier;
}
/** Returns a copy of this list whose Instance objects are shared rather
 *  than deep-copied; the per-instance weight list is copied when present. */
public InstanceList shallowClone ()
{
    InstanceList copy = new InstanceList (pipe, instances.size());
    for (int idx = 0; idx < instances.size(); idx++)
        copy.add (getInstance (idx));
    copy.instanceWeights = (instanceWeights == null)
        ? null
        : instanceWeights.cloneDoubleList();
    return copy;
}
/**
 * Test a maxent classifier. The data representation is the same as
 * for training.
 *
 * @param classifier the classifier to test
 * @param data an iterator over labeled instances
 * @return accuracy on the data
 */
static public double test (Classifier classifier, PipeInputIterator data)
{
    // Pipe the raw test data through the classifier's own instance pipe so
    // features line up with those seen at training time.
    InstanceList evalList = new InstanceList (classifier.getInstancePipe());
    evalList.add (data);
    logger.info("# test instances = " + evalList.size());
    return classifier.getAccuracy (evalList);
}
public MaximizableTrainer (InstanceList ilist, MaxEnt initialClassifier) Alphabet fd = ilist.getDataAlphabet(); LabelAlphabet ld = (LabelAlphabet) ilist.getTargetAlphabet(); Arrays.fill (constraints, 0.0); Arrays.fill (cachedGradient, 0.0); this.featureSelection = ilist.getFeatureSelection(); this.perLabelFeatureSelection = ilist.getPerLabelFeatureSelection(); this.perLabelFeatureSelection = theClassifier.perClassFeatureSelection; this.defaultFeatureIndex = theClassifier.defaultFeatureIndex; assert (initialClassifier.getInstancePipe() == ilist.getPipe()); this.theClassifier = new MaxEnt (ilist.getPipe(), parameters, featureSelection, perLabelFeatureSelection); InstanceList.Iterator iter = trainingList.iterator (); logger.fine("Number of instances in training list = " + trainingList.size()); while (iter.hasNext()) { double instanceWeight = iter.getInstanceWeight();
/** Trains a MaxEnt classifier on citation pairs generated from each node
 *  group, freezes the data alphabet, and reports training F1 on both
 *  "yes" and "no". */
public Classifier trainPairwiseClassifier (ArrayList[] nodes, Pipe p)
{
    InstanceList pairList = new InstanceList (p);
    for (int n = 0; n < nodes.length; n++)
        pairList.add (CitationUtils.makePairs (p, nodes[n]));
    System.err.println ("Training size: " + pairList.size() + "\tNum features: " + pairList.getDataAlphabet().size());
    MaxEnt classifier = (MaxEnt) (new MaxEntTrainer().train (pairList, null, null, null, null));
    // Freeze the alphabet so later piping cannot grow the feature space.
    pairList.getDataAlphabet().stopGrowth();
    Trial trial = new Trial (classifier, pairList);
    System.out.println("Pairwise classifier: -> Training F1 on \"yes\" is: " + trial.labelF1("yes"));
    System.out.println("Pairwise classifier: -> Training F1 on \"no\" is: " + trial.labelF1("no"));
    return classifier;
}
/** Keeps only the clusters in <code>clusterList</code> with more than two
 *  members, then pipes the surviving clusters through <code>pipe</code>. */
private static InstanceList getNonTrivialTesting (Pipe pipe, InstanceList clusterList)
{
    InstanceList keptClusters = new InstanceList (new Alphabet(), new Alphabet());
    for (InstanceList.Iterator it = clusterList.iterator(); it.hasNext();) {
        Instance instance = (Instance) it.next();
        InstanceList cluster = (InstanceList) instance.getData ();
        // Clusters of size <= 2 are considered trivial and are dropped.
        if (cluster.size() > 2)
            keptClusters.add (instance);
    }
    InstanceList piped = new InstanceList (pipe);
    piped.add (new ClusterListIterator (keptClusters));
    return piped;
}
// Trains a CRF on half of the space-prediction data (deterministic 50/50
// split) and checks per-token accuracy on the held-out half against a
// previously observed regression value.
public void testTokenAccuracy ()
{
    Pipe p = makeSpacePredictionPipe ();
    InstanceList instances = new InstanceList(p);
    instances.add(new ArrayIterator(data));
    // Fixed seed (777) keeps the split, and thus the expected accuracy, stable.
    InstanceList[] lists = instances.split (new Random (777), new double[]{.5, .5});
    CRF4 crf = new CRF4(p.getDataAlphabet(), p.getTargetAlphabet());
    crf.addFullyConnectedStatesForLabels();
    crf.setUseSparseWeights (true);
    crf.train (lists[0]);
    TokenAccuracyEvaluator eval = new TokenAccuracyEvaluator ();
    eval.test (crf, lists[1], "Testing", null);
    // Regression value; the 0.001 tolerance absorbs minor numeric drift.
    assertEquals (0.9409, eval.getLastAccuracy (), 0.001);
}

public void testPrint ()
Classifier initialClassifier) FeatureSelection selectedFeatures = trainList.getFeatureSelection(); logger.fine ("Training underlying classifier"); Classifier c = underlyingClassifierTrainer.train (trainList, null, null, null, initialClassifier); InstanceList confidencePredictionTraining = new InstanceList (confidencePredictingPipe); logger.fine ("Creating confidence prediction instance list"); double weight; for (int i = 0; i < t.size(); i++) { Classification classification = t.getClassification(i); confidencePredictionTraining.add (classification, null, classification.getInstance().getName(), classification.getInstance().getSource());
public InstanceList cloneEmpty () { InstanceList ret = new InstanceList (pipe); ret.instanceWeights = instanceWeights == null ? null : (DoubleList) instanceWeights.clone(); // xxx Should the featureSelection and perLabel... be cloned? // Note that RoostingTrainer currently depends on not cloning its splitting. ret.featureSelection = this.featureSelection; ret.perLabelFeatureSelection = this.perLabelFeatureSelection; ret.dataClass = this.dataClass; ret.targetClass = this.targetClass; ret.dataVocab = this.dataVocab; ret.targetVocab = this.targetVocab; return ret; }