/**
 * Creates a new {@code Dictionary} and calls {@code intern} on it once for every
 * element of {@code values}, in the order the iterable yields them.
 *
 * @param values the strings to intern into the new dictionary
 * @return the newly created dictionary
 */
public static Dictionary fromList(Iterable<String> values) {
  Dictionary result = new Dictionary();
  for (String entry : values) {
    result.intern(entry);
  }
  return result;
}
}
/**
 * Looks up the raw target label that corresponds to an integer code.
 * Each value currently held by the target dictionary is passed back through
 * {@code intern}, and the first value whose returned code equals {@code code} wins.
 *
 * @param code the integer code encoded during the training process
 * @return the raw target label, or {@code null} when no dictionary value yields that code
 */
public String getTargetLabel(int code) {
  String label = null;
  for (String candidate : targetDictionary.values()) {
    if (targetDictionary.intern(candidate) == code) {
      label = candidate;
      break;
    }
  }
  return label;
}
/**
 * Checks that {@code values()} lists entries in the order they were interned and
 * that a dictionary rebuilt through {@code Dictionary.fromList} reports the same order.
 */
@Test
public void testDictionaryOrder() {
  String expected = "[a, d, c, b, qrz]";

  Dictionary original = new Dictionary();
  for (String token : new String[] {"a", "d", "c", "b", "qrz"}) {
    original.intern(token);
  }
  assertEquals(expected, original.values().toString());

  Dictionary rebuilt = Dictionary.fromList(original.values());
  assertEquals(expected, rebuilt.values().toString());
}
}
/**
 * Defines the values, and thus the encoding, of the target variable. Note that any
 * value of the target variable not present in this list will be given the value of
 * the last member of the list.
 *
 * <p>If no explicit maximum has been set (the limit is still {@code Integer.MAX_VALUE}),
 * the limit becomes the number of supplied values. Every value is then interned into
 * the target dictionary in list order.
 *
 * @param values The values the target variable can have.
 * @throws IllegalArgumentException if more values are supplied than the current maximum allows
 */
@Override
public void defineTargetCategories(List<String> values) {
  int categoryCount = values.size();
  Preconditions.checkArgument(
      categoryCount <= maxTargetValue,
      "Must have less than or equal to " + maxTargetValue
          + " categories for target variable, but found " + categoryCount);

  // An unset limit defaults to exactly the number of categories provided.
  if (maxTargetValue == Integer.MAX_VALUE) {
    maxTargetValue = categoryCount;
  }

  for (String category : values) {
    targetDictionary.intern(category);
  }
}
/**
 * Returns the values of the target dictionary, cut down to at most
 * {@code maxTargetValue} entries.
 *
 * @return the list obtained from the target dictionary, with any entries at index
 *         {@code maxTargetValue} or beyond removed
 */
@Override
public List<String> getTargetCategories() {
  List<String> categories = targetDictionary.values();
  int total = categories.size();
  if (total <= maxTargetValue) {
    return categories;
  }
  // Drop the tail in place so that only the first maxTargetValue entries remain.
  categories.subList(maxTargetValue, total).clear();
  return categories;
}
/**
 * Construct a parser for CSV lines that encodes the parsed data in vector form.
 * Stores the supplied target name and type map and creates a new {@code Dictionary}
 * for the target variable.
 *
 * @param targetName The name of the target variable.
 * @param typeMap    A map describing the types of the predictor variables.
 */
public CsvRecordFactory(String targetName, Map<String, String> typeMap) {
  this.targetName = targetName;
  this.typeMap = typeMap;
  this.targetDictionary = new Dictionary();
}
/**
 * Defines the values, and thus the encoding, of the target variable. Note that any
 * value of the target variable not present in this list will be given the value of
 * the last member of the list.
 *
 * <p>If no explicit maximum has been set (the limit is still {@code Integer.MAX_VALUE}),
 * the limit becomes the number of supplied values. Every value is then interned into
 * the target dictionary in list order.
 *
 * @param values The values the target variable can have.
 * @throws IllegalArgumentException if more values are supplied than the current maximum allows
 */
@Override
public void defineTargetCategories(List<String> values) {
  int categoryCount = values.size();
  Preconditions.checkArgument(
      categoryCount <= maxTargetValue,
      "Must have less than or equal to " + maxTargetValue
          + " categories for target variable, but found " + categoryCount);

  // An unset limit defaults to exactly the number of categories provided.
  if (maxTargetValue == Integer.MAX_VALUE) {
    maxTargetValue = categoryCount;
  }

  for (String category : values) {
    targetDictionary.intern(category);
  }
}
/**
 * Returns the values of the target dictionary, cut down to at most
 * {@code maxTargetValue} entries.
 *
 * @return the list obtained from the target dictionary, with any entries at index
 *         {@code maxTargetValue} or beyond removed
 */
@Override
public List<String> getTargetCategories() {
  List<String> categories = targetDictionary.values();
  int total = categories.size();
  if (total <= maxTargetValue) {
    return categories;
  }
  // Drop the tail in place so that only the first maxTargetValue entries remain.
  categories.subList(maxTargetValue, total).clear();
  return categories;
}
/**
 * Construct a parser for CSV lines that encodes the parsed data in vector form.
 * Stores the supplied target name and type map and creates a new {@code Dictionary}
 * for the target variable.
 *
 * @param targetName The name of the target variable.
 * @param typeMap    A map describing the types of the predictor variables.
 */
public CsvRecordFactory(String targetName, Map<String, String> typeMap) {
  this.targetName = targetName;
  this.typeMap = typeMap;
  this.targetDictionary = new Dictionary();
}
/**
 * Creates a new {@code Dictionary} and calls {@code intern} on it once for every
 * element of {@code values}, in the order the iterable yields them.
 *
 * @param values the strings to intern into the new dictionary
 * @return the newly created dictionary
 */
public static Dictionary fromList(Iterable<String> values) {
  Dictionary result = new Dictionary();
  for (String entry : values) {
    result.intern(entry);
  }
  return result;
}
}
/**
 * Looks up the raw target label that corresponds to an integer code.
 * Each value currently held by the target dictionary is passed back through
 * {@code intern}, and the first value whose returned code equals {@code code} wins.
 *
 * @param code the integer code encoded during the training process
 * @return the raw target label, or {@code null} when no dictionary value yields that code
 */
public String getTargetLabel(int code) {
  String label = null;
  for (String candidate : targetDictionary.values()) {
    if (targetDictionary.intern(candidate) == code) {
      label = candidate;
      break;
    }
  }
  return label;
}
/**
 * Defines the values, and thus the encoding, of the target variable. Note that any
 * value of the target variable not present in this list will be given the value of
 * the last member of the list.
 *
 * <p>If no explicit maximum has been set (the limit is still {@code Integer.MAX_VALUE}),
 * the limit becomes the number of supplied values. Every value is then interned into
 * the target dictionary in list order.
 *
 * @param values The values the target variable can have.
 * @throws IllegalArgumentException if more values are supplied than the current maximum allows
 */
@Override
public void defineTargetCategories(List<String> values) {
  int categoryCount = values.size();
  Preconditions.checkArgument(
      categoryCount <= maxTargetValue,
      "Must have less than or equal to " + maxTargetValue
          + " categories for target variable, but found " + categoryCount);

  // An unset limit defaults to exactly the number of categories provided.
  if (maxTargetValue == Integer.MAX_VALUE) {
    maxTargetValue = categoryCount;
  }

  for (String category : values) {
    targetDictionary.intern(category);
  }
}
/**
 * Returns the values of the target dictionary, cut down to at most
 * {@code maxTargetValue} entries.
 *
 * @return the list obtained from the target dictionary, with any entries at index
 *         {@code maxTargetValue} or beyond removed
 */
@Override
public List<String> getTargetCategories() {
  List<String> categories = targetDictionary.values();
  int total = categories.size();
  if (total <= maxTargetValue) {
    return categories;
  }
  // Drop the tail in place so that only the first maxTargetValue entries remain.
  categories.subList(maxTargetValue, total).clear();
  return categories;
}
/**
 * Construct a parser for CSV lines that encodes the parsed data in vector form.
 * Stores the supplied target name and type map and creates a new {@code Dictionary}
 * for the target variable.
 *
 * @param targetName The name of the target variable.
 * @param typeMap    A map describing the types of the predictor variables.
 */
public CsvRecordFactory(String targetName, Map<String, String> typeMap) {
  this.targetName = targetName;
  this.typeMap = typeMap;
  this.targetDictionary = new Dictionary();
}
/**
 * Creates a new {@code Dictionary} and calls {@code intern} on it once for every
 * element of {@code values}, in the order the iterable yields them.
 *
 * @param values the strings to intern into the new dictionary
 * @return the newly created dictionary
 */
public static Dictionary fromList(Iterable<String> values) {
  Dictionary result = new Dictionary();
  for (String entry : values) {
    result.intern(entry);
  }
  return result;
}
}
/**
 * Looks up the raw target label that corresponds to an integer code.
 * Each value currently held by the target dictionary is passed back through
 * {@code intern}, and the first value whose returned code equals {@code code} wins.
 *
 * @param code the integer code encoded during the training process
 * @return the raw target label, or {@code null} when no dictionary value yields that code
 */
public String getTargetLabel(int code) {
  String label = null;
  for (String candidate : targetDictionary.values()) {
    if (targetDictionary.intern(candidate) == code) {
      label = candidate;
      break;
    }
  }
  return label;
}
/**
 * Decodes a single line of CSV data and records the target (if {@code returnTarget} is
 * true) and predictor variables. As a side effect, features are added into
 * {@code featureVector}. When used to classify production data that carries no target
 * value, call this method with {@code returnTarget = false}.
 *
 * @param line          The raw data.
 * @param featureVector Where to fill in the features. Should be zeroed before calling
 *                      processLine.
 * @param returnTarget  whether to process and return the target value.
 * @return The value of the target variable, capped at {@code maxTargetValue - 1}, or
 *         {@code -1} when {@code returnTarget} is false.
 */
public int processLine(CharSequence line, Vector featureVector, boolean returnTarget) {
  List<String> fields = parseCsvLine(line);

  int targetValue = -1;
  if (returnTarget) {
    int code = targetDictionary.intern(fields.get(target));
    // Codes at or beyond the limit collapse onto the last permitted category.
    targetValue = code >= maxTargetValue ? maxTargetValue - 1 : code;
  }

  for (Integer column : predictors) {
    // A negative column index has no CSV field behind it, so the encoder is given null.
    String cell = null;
    if (column >= 0) {
      cell = fields.get(column);
    }
    predictorEncoders.get(column).addToVector(cell, featureVector);
  }
  return targetValue;
}
/**
 * Prints a tab-separated summary of the most significant feature weights of the best
 * learner found by {@code learningAlgorithm}.
 *
 * <p>The best learner is closed, a random sample of up to 500 of the supplied files is
 * encoded and fed to a {@code ModelDissector} together with the trace dictionary that
 * the encoders fill in, and the dissector's top 100 weights are written to standard
 * output.
 *
 * @param newsGroups        dictionary whose values are used to name categories in the output
 * @param learningAlgorithm the trained learner from which the best model is taken
 * @param files             the files to sample from
 * @throws IOException if thrown while encoding a file or by any other call in this method
 */
private static void dissect(Dictionary newsGroups,
                            AdaptiveLogisticRegression learningAlgorithm,
                            Iterable<File> files) throws IOException {
  CrossFoldLearner model = learningAlgorithm.getBest().getPayload().getLearner();
  model.close();

  Map<String, Set<Integer>> traceDictionary = Maps.newTreeMap();
  ModelDissector md = new ModelDissector();

  // Both encoders record into the same trace dictionary, which is reset per file below.
  encoder.setTraceDictionary(traceDictionary);
  bias.setTraceDictionary(traceDictionary);

  // Sample at most 500 files. With fewer files available the whole shuffled list is used
  // instead of letting subList(0, 500) throw IndexOutOfBoundsException.
  List<File> shuffled = permute(files, rand);
  int sampleSize = Math.min(500, shuffled.size());
  for (File file : shuffled.subList(0, sampleSize)) {
    traceDictionary.clear();
    Vector v = encodeFeatureVector(file);
    md.update(v, traceDictionary, model);
  }

  List<String> ngNames = Lists.newArrayList(newsGroups.values());
  List<ModelDissector.Weight> weights = md.summary(100);
  for (ModelDissector.Weight w : weights) {
    // NOTE(review): the format pairs %.1f with getCategory(..) and %s with getWeight(..);
    // confirm the specifier order matches the return types of those accessors.
    // NOTE(review): the name lookup uses getMaxImpact() + 1 - confirm the offset is intended.
    System.out.printf("%s\t%.1f\t%s\t%.1f\t%s\t%.1f\t%s\n",
        w.getFeature(), w.getWeight(), ngNames.get(w.getMaxImpact() + 1),
        w.getCategory(1), w.getWeight(1), w.getCategory(2), w.getWeight(2));
  }
}
// Encoder created with the name "Lines"; it is pointed at the shared traceDictionary.
FeatureVectorEncoder lines = new ConstantValueEncoder("Lines");
lines.setTraceDictionary(traceDictionary);
// New dictionary into which the newsgroup's name is interned.
Dictionary newsGroups = new Dictionary();
newsGroups.intern(newsgroup.getName());
// Append everything listFiles() returns for this newsgroup to the files collection.
// NOTE(review): if newsgroup is a java.io.File, listFiles() may return null (not a
// directory or an I/O error), which would make Arrays.asList throw - confirm with caller.
files.addAll(Arrays.asList(newsgroup.listFiles()));
/**
 * Decodes a single line of CSV data and records the target (if {@code returnTarget} is
 * true) and predictor variables. As a side effect, features are added into
 * {@code featureVector}. When used to classify production data that carries no target
 * value, call this method with {@code returnTarget = false}.
 *
 * @param line          The raw data.
 * @param featureVector Where to fill in the features. Should be zeroed before calling
 *                      processLine.
 * @param returnTarget  whether to process and return the target value.
 * @return The value of the target variable, capped at {@code maxTargetValue - 1}, or
 *         {@code -1} when {@code returnTarget} is false.
 */
public int processLine(CharSequence line, Vector featureVector, boolean returnTarget) {
  List<String> fields = parseCsvLine(line);

  int targetValue = -1;
  if (returnTarget) {
    int code = targetDictionary.intern(fields.get(target));
    // Codes at or beyond the limit collapse onto the last permitted category.
    targetValue = code >= maxTargetValue ? maxTargetValue - 1 : code;
  }

  for (Integer column : predictors) {
    // A negative column index has no CSV field behind it, so the encoder is given null.
    String cell = null;
    if (column >= 0) {
      cell = fields.get(column);
    }
    predictorEncoders.get(column).addToVector(cell, featureVector);
  }
  return targetValue;
}