protected Set<Feature> getFeatureSet(FrequencyDistribution<String> fd) throws TextClassificationException { /* * Instead of iterating all top-k ngrams comparing them to all document ngrams for each * iteration (expensive for large top-Ks),we build all features that might be created only once. * We copy this feature map then for each call, which is cheaper and update only the values of those ngrams that are found. * (TH 2018-09-23) */ Set<Feature> features = new HashSet<>(prepFeatSet); for (String ng : fd.getKeys()) { if (topKSet.contains(ng)) { // remove default value from set, i.e. feature name and value are part of the // features identity. Thus, remove feature with value 0 and add new one with value // 1. Just adding the same feature with a new value will NOT override the existing // entry. Feature feature = new Feature(getFeaturePrefix() + "_" + ng, 0, true, FeatureType.BOOLEAN); features.remove(feature); //Set value to 1, i.e. feature found and mark the feature value as non-default value feature.setValue(1); feature.setDefault(false); //add to set features.add(feature); } } return features; }