/**
 * Sets how often the evolutionary optimization of learning parameters occurs,
 * using the same value for both the minimum and maximum interval.
 *
 * @param interval Number of training examples to use in each epoch of optimization.
 */
public void setInterval(int interval) {
  // Delegate to the two-argument form with identical lower and upper bounds.
  setInterval(interval, interval);
}
/**
 * Trains on a single example that has no group key.
 *
 * @param trackingKey key identifying this training example
 * @param actual      the observed category of the example
 * @param instance    the feature vector of the example
 */
@Override
public void train(long trackingKey, int actual, Vector instance) {
  // No group key for this overload; pass null through to the full form.
  train(trackingKey, null, actual, instance);
}
/**
 * Computes the record number at which the next optimization epoch should run.
 *
 * @param recordNumber the current record count
 * @return the next scheduled cutoff, never earlier than the current schedule
 */
public int nextStep(int recordNumber) {
  // Clamp the proposed step size into [minInterval, maxInterval].
  int step = Math.min(maxInterval, Math.max(minInterval, stepSize(recordNumber, 2.6)));

  // Next multiple of the step size strictly beyond recordNumber.
  int proposedCutoff = step * (recordNumber / step + 1);
  if (proposedCutoff < cutoff + currentStep) {
    // Not yet past the previously scheduled cutoff; keep the old schedule
    // and leave currentStep untouched.
    return cutoff + currentStep;
  }
  // Adopt the new step size going forward.
  this.currentStep = step;
  return proposedCutoff;
}
AdaptiveLogisticRegression learningAlgorithm = new AdaptiveLogisticRegression(20, FEATURES, new L1()); learningAlgorithm.setInterval(800); learningAlgorithm.setAveragingWindow(500); learningAlgorithm.train(actual, v); State<AdaptiveLogisticRegression.Wrapper, CrossFoldLearner> best = learningAlgorithm.getBest(); double maxBeta; double nonZeros; norm = beta.aggregate(Functions.PLUS, Functions.ABS); lambda = learningAlgorithm.getBest().getMappedParams()[0]; mu = learningAlgorithm.getBest().getMappedParams()[1]; } else { maxBeta = 0; if (learningAlgorithm.getBest() != null) { ModelSerializer.writeBinary("/tmp/news-group-" + k + ".model", learningAlgorithm.getBest().getPayload().getLearner().getModels().get(0)); learningAlgorithm.close(); dissect(newsGroups, learningAlgorithm, files); System.out.println("exiting main"); learningAlgorithm.getBest().getPayload().getLearner().getModels().get(0));
AdaptiveLogisticRegression adaptiveLogisticRegression = new AdaptiveLogisticRegression(2, 200, new L1()); adaptiveLogisticRegression.setInterval(1000); adaptiveLogisticRegression.train(r.getKey(), r.getActual(), r.getInstance()); if (i % 1000 == 0 && adaptiveLogisticRegression.getBest() != null) { System.out.printf("%10d %10.4f %10.8f %.3f\n", i, adaptiveLogisticRegression.auc(), Math.log10(adaptiveLogisticRegression.getBest().getMappedParams()[0]), adaptiveLogisticRegression.getBest().getMappedParams()[1]); assertEquals(1, adaptiveLogisticRegression.auc(), 0.1); adaptiveLogisticRegression.close();
@Test
@ThreadLeakLingering(linger = 1000)
public void constantStep() {
  AdaptiveLogisticRegression lr = new AdaptiveLogisticRegression(2, 1000, new L1());
  lr.setInterval(5000);
  // With a fixed 5000-record interval, every record in [15000, 20000)
  // maps to the same next cutoff of 20000.
  for (int record : new int[] {15000, 15001, 16500, 19999}) {
    assertEquals(20000, lr.nextStep(record));
  }
  lr.close();
}
@ThreadLeakLingering(linger = 1000)
@Test
public void adaptiveLogisticRegressionRoundTrip() throws IOException {
  AdaptiveLogisticRegression learner = new AdaptiveLogisticRegression(2, 5, new L1());
  learner.setInterval(200);
  train(learner, 400);
  // Serialize and deserialize; the copy must score identically to the source.
  AdaptiveLogisticRegression olr3 = roundTrip(learner, AdaptiveLogisticRegression.class);
  double auc1 = learner.auc();
  assertTrue(auc1 > 0.85);
  // Both the original and the round-tripped copy must report the same AUC.
  assertEquals(auc1, learner.auc(), 1.0e-6);
  assertEquals(auc1, olr3.auc(), 1.0e-6);
  // NOTE(review): the original learner is trained twice here while the
  // round-tripped copy is trained once — presumably intentional, but confirm;
  // the 0.005-tolerance comparison below relies on the extra 1000 examples
  // not moving the AUC much.
  train(learner, 1000);
  train(learner, 1000);
  train(olr3, 1000);
  // NOTE(review): this assertion compares learner.auc() with itself and is
  // trivially true — possibly intended to compare against olr3, which the
  // next line already does. Confirm intent before removing.
  assertEquals(learner.auc(), learner.auc(), 0.005);
  assertEquals(learner.auc(), olr3.auc(), 0.005);
  double auc2 = learner.auc();
  // Additional training should have improved the AUC.
  assertTrue(String.format("%.3f > %.3f", auc2, auc1), auc2 > auc1);
  learner.close();
  olr3.close();
}
/**
 * Sets the size of the evolutionary pool and rebuilds the optimizer to match.
 *
 * @param poolSize the new pool size
 */
public void setPoolSize(int poolSize) {
  this.poolSize = poolSize;
  // The optimizer must be reconstructed whenever the pool size changes.
  setupOptimizer(this.poolSize);
}
@Override public void train(long trackingKey, String groupKey, int actual, Vector instance) { record++; buffer.add(new TrainingExample(trackingKey, groupKey, actual, instance)); //don't train until we have enough examples if (buffer.size() > bufferSize) { trainWithBufferedExamples(); } }
cutoff = nextStep(record);
/**
 * Constructs an adaptive logistic regression trainer.
 *
 * @param numCategories The number of categories (labels) to train on
 * @param numFeatures The number of features used in creating the vectors (i.e. the cardinality of the vector)
 * @param prior The {@link org.apache.mahout.classifier.sgd.PriorFunction} to use
 * @param threadCount The number of threads to use for training
 * @param poolSize The number of {@link org.apache.mahout.classifier.sgd.CrossFoldLearner} to use.
 */
public AdaptiveLogisticRegression(int numCategories, int numFeatures, PriorFunction prior,
                                  int threadCount, int poolSize) {
  this.numFeatures = numFeatures;
  this.threadCount = threadCount;
  this.poolSize = poolSize;
  // Seed state carries two evolvable parameters and a 10x mutation range.
  seed = new State<Wrapper, CrossFoldLearner>(new double[2], 10);
  Wrapper w = new Wrapper(numCategories, numFeatures, prior);
  seed.setPayload(w);
  // Fix: the original called seed.setPayload(w) a second time after
  // Wrapper.setMappings(seed); the duplicate call was redundant and removed.
  Wrapper.setMappings(seed);
  setPoolSize(this.poolSize);
}
// Prints the most influential features of the best model found so far.
private static void dissect(Dictionary newsGroups,
                            AdaptiveLogisticRegression learningAlgorithm,
                            Iterable<File> files) throws IOException {
  // Take the best learner found so far and finalize it before inspection.
  CrossFoldLearner model = learningAlgorithm.getBest().getPayload().getLearner();
  model.close();
  Map<String, Set<Integer>> traceDictionary = Maps.newTreeMap();
  ModelDissector md = new ModelDissector();
  // Route encoder traces into the shared dictionary so the dissector can
  // attribute weights back to named features.
  encoder.setTraceDictionary(traceDictionary);
  bias.setTraceDictionary(traceDictionary);
  // Sample 500 randomly permuted files; the trace dictionary is reset per file
  // so each update sees only that file's features.
  for (File file : permute(files, rand).subList(0, 500)) {
    traceDictionary.clear();
    Vector v = encodeFeatureVector(file);
    md.update(v, traceDictionary, model);
  }
  List<String> ngNames = Lists.newArrayList(newsGroups.values());
  // Report the 100 heaviest features.
  List<ModelDissector.Weight> weights = md.summary(100);
  for (ModelDissector.Weight w : weights) {
    // NOTE(review): the "+ 1" offset into ngNames looks suspicious — presumably
    // it compensates for an indexing offset in getMaxImpact(), but confirm; an
    // off-by-one here would mislabel every reported category.
    System.out.printf("%s\t%.1f\t%s\t%.1f\t%s\t%.1f\t%s\n",
        w.getFeature(), w.getWeight(), ngNames.get(w.getMaxImpact() + 1),
        w.getCategory(1), w.getWeight(1), w.getCategory(2), w.getWeight(2));
  }
}
@Test @ThreadLeakLingering(linger = 1000) public void growingStep() { AdaptiveLogisticRegression lr = new AdaptiveLogisticRegression(2, 1000, new L1()); lr.setInterval(2000, 10000); // start with minimum step size for (int i = 2000; i < 20000; i+=2000) { assertEquals(i + 2000, lr.nextStep(i)); } // then level up a bit for (int i = 20000; i < 50000; i += 5000) { assertEquals(i + 5000, lr.nextStep(i)); } // and more, but we top out with this step size for (int i = 50000; i < 500000; i += 10000) { assertEquals(i + 10000, lr.nextStep(i)); } lr.close(); } }
/**
 * Sets the number of training threads and rebuilds the optimizer so that the
 * new thread count takes effect.
 *
 * @param threadCount the number of threads to use for training
 */
public void setThreadCount(int threadCount) {
  this.threadCount = threadCount;
  // Rebuild with the existing pool size; only the thread count changed.
  setupOptimizer(poolSize);
}
@Override public void train(long trackingKey, String groupKey, int actual, Vector instance) { record++; buffer.add(new TrainingExample(trackingKey, groupKey, actual, instance)); //don't train until we have enough examples if (buffer.size() > bufferSize) { trainWithBufferedExamples(); } }
cutoff = nextStep(record);
/**
 * Constructs an adaptive logistic regression trainer.
 *
 * @param numCategories The number of categories (labels) to train on
 * @param numFeatures The number of features used in creating the vectors (i.e. the cardinality of the vector)
 * @param prior The {@link org.apache.mahout.classifier.sgd.PriorFunction} to use
 * @param threadCount The number of threads to use for training
 * @param poolSize The number of {@link org.apache.mahout.classifier.sgd.CrossFoldLearner} to use.
 */
public AdaptiveLogisticRegression(int numCategories, int numFeatures, PriorFunction prior,
                                  int threadCount, int poolSize) {
  this.numFeatures = numFeatures;
  this.threadCount = threadCount;
  this.poolSize = poolSize;
  // Seed state carries two evolvable parameters and a 10x mutation range.
  seed = new State<Wrapper, CrossFoldLearner>(new double[2], 10);
  Wrapper w = new Wrapper(numCategories, numFeatures, prior);
  seed.setPayload(w);
  // Fix: the original called seed.setPayload(w) a second time after
  // Wrapper.setMappings(seed); the duplicate call was redundant and removed.
  Wrapper.setMappings(seed);
  setPoolSize(this.poolSize);
}
/**
 * Trains on one example, using the running record count as a synthetic
 * tracking key.
 *
 * @param actual   the observed category of the example
 * @param instance the feature vector of the example
 */
@Override
public void train(int actual, Vector instance) {
  // No caller-supplied key; fall back to the current record counter.
  train(record, null, actual, instance);
}
/**
 * Updates the training thread count and reconstructs the optimizer so the
 * change takes effect immediately.
 *
 * @param threadCount the number of threads to use for training
 */
public void setThreadCount(int threadCount) {
  this.threadCount = threadCount;
  // Pool size is unchanged; the rebuild just picks up the new thread count.
  setupOptimizer(poolSize);
}
/**
 * Controls how frequently the evolutionary optimization of learning
 * parameters runs. A single value fixes both ends of the interval range.
 *
 * @param interval Number of training examples to use in each epoch of optimization.
 */
public void setInterval(int interval) {
  // Same value for minimum and maximum yields a constant interval.
  setInterval(interval, interval);
}