/**
 * Set target distributions using the "Schapire" heuristic described in
 * "Learning from Labeled Features using Generalized Expectation Criteria"
 * Gregory Druck, Gideon Mann, Andrew McCallum.
 *
 * @param labeledFeatures HashMap of feature indices to lists of label indices for that feature.
 * @param numLabels Total number of labels.
 * @param majorityProb Probability mass divided among majority labels.
 * @return Constraints (map of feature index to target distribution), with target
 *         distributions set using the heuristic.
 */
public static HashMap<Integer,double[]> setTargetsUsingHeuristic(HashMap<Integer,ArrayList<Integer>> labeledFeatures, int numLabels, double majorityProb) {
  HashMap<Integer,double[]> constraints = new HashMap<Integer,double[]>();
  // Build one heuristic prior per labeled feature.
  for (int featureIndex : labeledFeatures.keySet()) {
    ArrayList<Integer> labelIndices = labeledFeatures.get(featureIndex);
    double[] prior = getHeuristicPrior(labelIndices, numLabels, majorityProb);
    constraints.put(featureIndex, prior);
  }
  return constraints;
}
/**
 * Convenience overload: delegates to the three-argument {@code labelFeatures}
 * with its boolean flag enabled.
 *
 * @param list InstanceList used for labeling.
 * @param features List of feature indices to label.
 * @return Map of feature indices to lists of label indices.
 */
public static HashMap<Integer, ArrayList<Integer>> labelFeatures(InstanceList list, ArrayList<Integer> features) {
  boolean defaultFlag = true;
  return labelFeatures(list, features, defaultFlag);
}
/**
 * Reads feature constraints from a file, whether they are stored
 * using Strings or indices.
 *
 * @param filename File with feature constraints.
 * @param data InstanceList used for alphabets.
 * @return Constraints.
 */
public static HashMap<Integer,double[]> readConstraintsFromFile(String filename, InstanceList data) {
  // Detect the on-disk format first, then dispatch to the matching reader.
  boolean indexBased = testConstraintsFileIndexBased(filename);
  return indexBased
      ? readConstraintsFromFileIndex(filename, data)
      : readConstraintsFromFileString(filename, data);
}
features = FeatureConstraintUtil.selectFeaturesByInfoGain(list,numConstraints.value); ObjectInputStream ois = new ObjectInputStream(new FileInputStream(ldaFile.value)); ParallelTopicModel lda = (ParallelTopicModel)ois.readObject(); features = FeatureConstraintUtil.selectTopLDAFeatures(numConstraints.value, lda, list.getDataAlphabet()); constraints = FeatureConstraintUtil.setTargetsUsingData(list, features); featuresAndLabels = FeatureConstraintUtil.labelFeatures(list,features); constraints = FeatureConstraintUtil.setTargetsUsingHeuristic(featuresAndLabels,list.getTargetAlphabet().size(),majorityProb.value); constraints = FeatureConstraintUtil.setTargetsUsingFeatureVoting(featuresAndLabels,list);
HashMap<Integer,ArrayList<Integer>> labeledFeatures = new HashMap<Integer,ArrayList<Integer>>(); double[][] featureLabelCounts = getFeatureLabelCounts(list,true); MatrixOps.plusEquals(prob,1e-8); MatrixOps.timesEquals(prob, 1./MatrixOps.sum(prob)); int[] sortedIndices = getMaxIndices(prob); ArrayList<Integer> labels = new ArrayList<Integer>();
FeatureConstraintUtil.readConstraintsFromFile(constraintsFile, trainingList);
FeatureConstraintUtil.setTargetsUsingHeuristic(featuresAndLabels, numLabels, 0.9);
/** * Set target distributions using estimates from data. * * @param list InstanceList used to estimate targets. * @param features List of features for constraints. * @param normalize Whether to normalize by feature counts * @return Constraints (map of feature index to target), with targets * set using estimates from supplied data. */ public static HashMap<Integer,double[]> setTargetsUsingData(InstanceList list, ArrayList<Integer> features, boolean useValues, boolean normalize) { HashMap<Integer,double[]> constraints = new HashMap<Integer,double[]>(); double[][] featureLabelCounts = getFeatureLabelCounts(list,useValues); for (int i = 0; i < features.size(); i++) { int fi = features.get(i); if (fi != list.getDataAlphabet().size()) { double[] prob = featureLabelCounts[fi]; if (normalize) { // Smooth probability distributions by adding a (very) // small count. We just need to make sure they aren't // zero in which case the KL-divergence is infinite. MatrixOps.plusEquals(prob, 1e-8); MatrixOps.timesEquals(prob, 1./MatrixOps.sum(prob)); } constraints.put(fi, prob); } } return constraints; }
FeatureConstraintUtil.readRangeConstraintsFromFile(constraintsFile, trainingList);
labelByVoting(labeledFeatures,instance,labelDist); } else { int li = labeling.getBestIndex();
features = FeatureConstraintUtil.selectFeaturesByInfoGain(list,numConstraints.value); ObjectInputStream ois = new ObjectInputStream(new FileInputStream(ldaFile.value)); ParallelTopicModel lda = (ParallelTopicModel)ois.readObject(); features = FeatureConstraintUtil.selectTopLDAFeatures(numConstraints.value, lda, list.getDataAlphabet()); constraints = FeatureConstraintUtil.setTargetsUsingData(list, features); featuresAndLabels = FeatureConstraintUtil.labelFeatures(list,features); constraints = FeatureConstraintUtil.setTargetsUsingHeuristic(featuresAndLabels,list.getTargetAlphabet().size(),majorityProb.value); constraints = FeatureConstraintUtil.setTargetsUsingFeatureVoting(featuresAndLabels,list);
HashMap<Integer,ArrayList<Integer>> labeledFeatures = new HashMap<Integer,ArrayList<Integer>>(); double[][] featureLabelCounts = getFeatureLabelCounts(list,true); MatrixOps.plusEquals(prob,1e-8); MatrixOps.timesEquals(prob, 1./MatrixOps.sum(prob)); int[] sortedIndices = getMaxIndices(prob); ArrayList<Integer> labels = new ArrayList<Integer>();
FeatureConstraintUtil.readConstraintsFromFile(constraintsFile, trainingList);
FeatureConstraintUtil.setTargetsUsingHeuristic(featuresAndLabels, numLabels, 0.9);
/** * Set target distributions using estimates from data. * * @param list InstanceList used to estimate targets. * @param features List of features for constraints. * @param normalize Whether to normalize by feature counts * @return Constraints (map of feature index to target), with targets * set using estimates from supplied data. */ public static HashMap<Integer,double[]> setTargetsUsingData(InstanceList list, ArrayList<Integer> features, boolean useValues, boolean normalize) { HashMap<Integer,double[]> constraints = new HashMap<Integer,double[]>(); double[][] featureLabelCounts = getFeatureLabelCounts(list,useValues); for (int i = 0; i < features.size(); i++) { int fi = features.get(i); if (fi != list.getDataAlphabet().size()) { double[] prob = featureLabelCounts[fi]; if (normalize) { // Smooth probability distributions by adding a (very) // small count. We just need to make sure they aren't // zero in which case the KL-divergence is infinite. MatrixOps.plusEquals(prob, 1e-8); MatrixOps.timesEquals(prob, 1./MatrixOps.sum(prob)); } constraints.put(fi, prob); } } return constraints; }
FeatureConstraintUtil.readRangeConstraintsFromFile(constraintsFile, trainingList);
labelByVoting(labeledFeatures,instance,labelDist); } else { int li = labeling.getBestIndex();
features = FeatureConstraintUtil.selectFeaturesByInfoGain(list,numConstraints.value); ObjectInputStream ois = new ObjectInputStream(new FileInputStream(ldaFile.value)); ParallelTopicModel lda = (ParallelTopicModel)ois.readObject(); features = FeatureConstraintUtil.selectTopLDAFeatures(numConstraints.value, lda, list.getDataAlphabet()); constraints = FeatureConstraintUtil.setTargetsUsingData(list, features); featuresAndLabels = FeatureConstraintUtil.labelFeatures(list,features); constraints = FeatureConstraintUtil.setTargetsUsingHeuristic(featuresAndLabels,list.getTargetAlphabet().size(),majorityProb.value); constraints = FeatureConstraintUtil.setTargetsUsingFeatureVoting(featuresAndLabels,list);
/**
 * Reads feature constraints from a file, whether they are stored
 * using Strings or indices.
 *
 * @param filename File with feature constraints.
 * @param data InstanceList used for alphabets.
 * @return Constraints.
 */
public static HashMap<Integer,double[]> readConstraintsFromFile(String filename, InstanceList data) {
  // Choose the reader based on whether the file stores indices or Strings.
  if (!testConstraintsFileIndexBased(filename)) {
    return readConstraintsFromFileString(filename, data);
  }
  return readConstraintsFromFileIndex(filename, data);
}
HashMap<Integer,ArrayList<Integer>> labeledFeatures = new HashMap<Integer,ArrayList<Integer>>(); double[][] featureLabelCounts = getFeatureLabelCounts(list,true); MatrixOps.plusEquals(prob,1e-8); MatrixOps.timesEquals(prob, 1./MatrixOps.sum(prob)); int[] sortedIndices = getMaxIndices(prob); ArrayList<Integer> labels = new ArrayList<Integer>();