/**
 * Returns a measure of how good the classification for a particular example
 * actually is.
 *
 * @param actual The correct category for the example.
 * @param data The vector to be classified.
 * @return The log likelihood of the correct answer as estimated by the current model. This will always be <= 0
 * and larger (closer to 0) indicates better accuracy. In order to simplify code that maintains running averages,
 * we bound this value at -100 (MIN_LOG_LIKELIHOOD).
 */
public double logLikelihood(int actual, Vector data) {
  if (numCategories() == 2) {
    // Binary case: classifyScalar(data) yields p(category 1); p(category 0) is its complement.
    double p = classifyScalar(data);
    if (actual > 0) {
      return Math.max(MIN_LOG_LIKELIHOOD, Math.log(p));
    } else {
      // log1p(-p) == log(1 - p) but is numerically more accurate for p near 0.
      return Math.max(MIN_LOG_LIKELIHOOD, Math.log1p(-p));
    }
  } else {
    // Multi-category case: classify(data) returns n-1 probabilities for categories 1..n-1,
    // so category `actual` lives at index actual - 1; category 0 gets the remaining mass.
    Vector p = classify(data);
    if (actual > 0) {
      return Math.max(MIN_LOG_LIKELIHOOD, Math.log(p.get(actual - 1)));
    } else {
      return Math.max(MIN_LOG_LIKELIHOOD, Math.log1p(-p.zSum()));
    }
  }
}
// NOTE(review): stray closing brace from the collapsed source — presumably closes the
// enclosing class, which is not visible in this chunk; confirm against the full file.
}
// Fragment (enclosing method signature not visible in this chunk): delegates to
// classifyFull(Vector, Vector) with a freshly allocated, zero-initialized result
// vector holding one slot per category.
// NOTE(review): presumably the body of classifyFull(Vector instance) — confirm against the full file.
return classifyFull(new DenseVector(numCategories()), instance);
// Fragment (enclosing method signature not visible in this chunk): delegates to
// classifyFull(Vector, Vector) with a freshly allocated, zero-initialized result
// vector holding one slot per category.
// NOTE(review): presumably the body of classifyFull(Vector instance) — confirm against the full file.
return classifyFull(new DenseVector(numCategories()), instance);
/**
 * Returns a measure of how good the classification for a particular example
 * actually is.
 *
 * @param actual The correct category for the example.
 * @param data The vector to be classified.
 * @return The log likelihood of the correct answer as estimated by the current model. This will always be <= 0
 * and larger (closer to 0) indicates better accuracy. In order to simplify code that maintains running averages,
 * we bound this value at -100 (MIN_LOG_LIKELIHOOD).
 */
public double logLikelihood(int actual, Vector data) {
  if (numCategories() == 2) {
    // Binary case: classifyScalar(data) yields p(category 1); p(category 0) is its complement.
    double p = classifyScalar(data);
    if (actual > 0) {
      return Math.max(MIN_LOG_LIKELIHOOD, Math.log(p));
    } else {
      // log1p(-p) == log(1 - p) but is numerically more accurate for p near 0.
      return Math.max(MIN_LOG_LIKELIHOOD, Math.log1p(-p));
    }
  } else {
    // Multi-category case: classify(data) returns n-1 probabilities for categories 1..n-1,
    // so category `actual` lives at index actual - 1; category 0 gets the remaining mass.
    Vector p = classify(data);
    if (actual > 0) {
      return Math.max(MIN_LOG_LIKELIHOOD, Math.log(p.get(actual - 1)));
    } else {
      return Math.max(MIN_LOG_LIKELIHOOD, Math.log1p(-p.zSum()));
    }
  }
}
// NOTE(review): stray closing brace from the collapsed source — presumably closes the
// enclosing class, which is not visible in this chunk; confirm against the full file.
}
// Fragment (enclosing method signature not visible in this chunk): delegates to
// classifyFull(Vector, Vector) with a freshly allocated, zero-initialized result
// vector holding one slot per category.
// NOTE(review): presumably the body of classifyFull(Vector instance) — confirm against the full file.
return classifyFull(new DenseVector(numCategories()), instance);
/**
 * Returns a measure of how good the classification for a particular example
 * actually is.
 *
 * @param actual The correct category for the example.
 * @param data The vector to be classified.
 * @return The log likelihood of the correct answer as estimated by the current model. This will always be <= 0
 * and larger (closer to 0) indicates better accuracy. In order to simplify code that maintains running averages,
 * we bound this value at -100 (MIN_LOG_LIKELIHOOD).
 */
public double logLikelihood(int actual, Vector data) {
  if (numCategories() == 2) {
    // Binary case: classifyScalar(data) yields p(category 1); p(category 0) is its complement.
    double p = classifyScalar(data);
    if (actual > 0) {
      return Math.max(MIN_LOG_LIKELIHOOD, Math.log(p));
    } else {
      // log1p(-p) == log(1 - p) but is numerically more accurate for p near 0.
      return Math.max(MIN_LOG_LIKELIHOOD, Math.log1p(-p));
    }
  } else {
    // Multi-category case: classify(data) returns n-1 probabilities for categories 1..n-1,
    // so category `actual` lives at index actual - 1; category 0 gets the remaining mass.
    Vector p = classify(data);
    if (actual > 0) {
      return Math.max(MIN_LOG_LIKELIHOOD, Math.log(p.get(actual - 1)));
    } else {
      return Math.max(MIN_LOG_LIKELIHOOD, Math.log1p(-p.zSum()));
    }
  }
}
// NOTE(review): stray closing brace from the collapsed source — presumably closes the
// enclosing class, which is not visible in this chunk; confirm against the full file.
}
/**
 * Returns a vector of probabilities of category 1, one for each row
 * of a matrix. This only makes sense if there are exactly two categories, but
 * calling this method in that case can save a number of vector allocations.
 *
 * @param data The matrix whose rows are vectors to classify
 * @return A vector of scores, with one value per row of the input matrix.
 * @throws IllegalArgumentException if this classifier does not have exactly two categories.
 */
public Vector classifyScalar(Matrix data) {
  Preconditions.checkArgument(numCategories() == 2, "Can only call classifyScalar with two categories");
  // One scalar score per input row.
  Vector r = new DenseVector(data.numRows());
  for (int row = 0; row < data.numRows(); row++) {
    r.set(row, classifyScalar(data.viewRow(row)));
  }
  return r;
}
/**
 * Returns a matrix where the rows of the matrix each contain {@code n} probabilities,
 * one for each category, where {@code n} is equal to {@code numCategories()}.
 *
 * @param data The matrix whose rows are the input vectors to classify
 * @return A matrix of scores, one row per row of the input matrix, one column per category.
 */
public Matrix classifyFull(Matrix data) {
  Matrix r = new DenseMatrix(data.numRows(), numCategories());
  for (int row = 0; row < data.numRows(); row++) {
    // classifyFull(Vector, Vector) writes the distribution directly into the row view.
    classifyFull(r.viewRow(row), data.viewRow(row));
  }
  return r;
}
/**
 * Returns n-1 probabilities, one for each of categories 1 through {@code n-1}, for each row
 * of a matrix, where {@code n} is equal to {@code numCategories()}. The probability of the
 * missing 0-th category is 1 - rowSum(this result).
 *
 * @param data The matrix whose rows are the input vectors to classify
 * @return A matrix of scores, one row per row of the input matrix, one column for each but the 0-th category.
 */
public Matrix classify(Matrix data) {
  // n-1 columns: the 0-th category is implied by the others.
  Matrix r = new DenseMatrix(data.numRows(), numCategories() - 1);
  for (int row = 0; row < data.numRows(); row++) {
    r.assignRow(row, classify(data.viewRow(row)));
  }
  return r;
}
// Fragment (enclosing method signature not visible in this chunk): fills slots 1..n-1 of r
// with the probabilities from classify(instance), then derives slot 0 as the remainder so
// the whole vector sums to 1.
// NOTE(review): correctness of the zSum() remainder assumes r arrives with r.get(0) == 0
// (e.g. a fresh DenseVector) — confirm against the callers in the full file.
r.viewPart(1, numCategories() - 1).assign(classify(instance));
r.setQuick(0, 1.0 - r.zSum());
return r;
// Fragment (enclosing method signature not visible in this chunk): fills slots 1..n-1 of r
// with the probabilities from classify(instance), then derives slot 0 as the remainder so
// the whole vector sums to 1.
// NOTE(review): correctness of the zSum() remainder assumes r arrives with r.get(0) == 0
// (e.g. a fresh DenseVector) — confirm against the callers in the full file.
r.viewPart(1, numCategories() - 1).assign(classify(instance));
r.setQuick(0, 1.0 - r.zSum());
return r;
// Fragment (enclosing method signature not visible in this chunk): fills slots 1..n-1 of r
// with the probabilities from classify(instance), then derives slot 0 as the remainder so
// the whole vector sums to 1.
// NOTE(review): correctness of the zSum() remainder assumes r arrives with r.get(0) == 0
// (e.g. a fresh DenseVector) — confirm against the callers in the full file.
r.viewPart(1, numCategories() - 1).assign(classify(instance));
r.setQuick(0, 1.0 - r.zSum());
return r;
// End-to-end check: trains a standard naive Bayes model on the toy data set and verifies
// the trained classifier's prediction on one instance.
@Test
public void toyData() throws Exception {
  TrainNaiveBayesJob trainNaiveBayes = new TrainNaiveBayesJob();
  trainNaiveBayes.setConf(conf);
  // "-el" extracts labels from the input; intermediate job output goes to tempDir.
  trainNaiveBayes.run(new String[] { "--input", inputFile.getAbsolutePath(),
      "--output", outputDir.getAbsolutePath(),
      "-el", "--tempDir", tempDir.getAbsolutePath() });
  // Load the trained model back from the job's output directory.
  NaiveBayesModel naiveBayesModel = NaiveBayesModel.materialize(new Path(outputDir.getAbsolutePath()), conf);
  AbstractVectorClassifier classifier = new StandardNaiveBayesClassifier(naiveBayesModel);
  // The toy problem is binary (stolen vs. not stolen).
  assertEquals(2, classifier.numCategories());
  Vector prediction = classifier.classifyFull(trainingInstance(COLOR_RED, TYPE_SUV, ORIGIN_DOMESTIC).get());
  // should be classified as not stolen
  assertTrue(prediction.get(0) < prediction.get(1));
}
// Same end-to-end check as toyData(), but trains with the complementary naive Bayes
// variant ("--trainComplementary") and classifies with the matching classifier.
@Test
public void toyDataComplementary() throws Exception {
  TrainNaiveBayesJob trainNaiveBayes = new TrainNaiveBayesJob();
  trainNaiveBayes.setConf(conf);
  // "-el" extracts labels from the input; intermediate job output goes to tempDir.
  trainNaiveBayes.run(new String[] { "--input", inputFile.getAbsolutePath(),
      "--output", outputDir.getAbsolutePath(),
      "-el", "--trainComplementary",
      "--tempDir", tempDir.getAbsolutePath() });
  // Load the trained model back from the job's output directory.
  NaiveBayesModel naiveBayesModel = NaiveBayesModel.materialize(new Path(outputDir.getAbsolutePath()), conf);
  AbstractVectorClassifier classifier = new ComplementaryNaiveBayesClassifier(naiveBayesModel);
  // The toy problem is binary (stolen vs. not stolen).
  assertEquals(2, classifier.numCategories());
  Vector prediction = classifier.classifyFull(trainingInstance(COLOR_RED, TYPE_SUV, ORIGIN_DOMESTIC).get());
  // should be classified as not stolen
  assertTrue(prediction.get(0) < prediction.get(1));
}