/**
 * Static convenience wrapper that delegates to the shared {@code KS_TEST} instance.
 *
 * @see org.apache.commons.math3.stat.inference.KolmogorovSmirnovTest#kolmogorovSmirnovTest(RealDistribution, double[])
 * @since 3.3
 */
public static double kolmogorovSmirnovTest(RealDistribution dist, double[] data)
        throws InsufficientDataException, NullArgumentException {
    final double pValue = KS_TEST.kolmogorovSmirnovTest(dist, data);
    return pValue;
}
// NOTE(review): as extracted, this fragment was garbled — the else branch was missing
// "ya = y;" and its closing brace, and both return statements were unconditional (the
// second unreachable). Reconstructed to match the documented Commons Math algorithm:
// for small sample products, ties are broken by jittering *copies* of the input so the
// D statistic is well-defined, then the exact p-value is computed; otherwise the
// asymptotic approximation is used on the original arrays.
double[] xa = null;
double[] ya = null;
if (lengthProduct < LARGE_SAMPLE_PRODUCT && hasTies(x, y)) {
    // Work on copies so the caller's arrays are not mutated by the tie-breaking jitter.
    xa = MathArrays.copyOf(x);
    ya = MathArrays.copyOf(y);
    fixTies(xa, ya);
} else {
    xa = x;
    ya = y;
}
if (lengthProduct < LARGE_SAMPLE_PRODUCT) {
    // Small samples: exact p-value on the (possibly jittered) data.
    return exactP(kolmogorovSmirnovStatistic(xa, ya), x.length, y.length, strict);
}
// Large samples: asymptotic approximation; ties are immaterial at this scale.
return approximateP(kolmogorovSmirnovStatistic(x, y), x.length, y.length);
/**
 * Convenience overload equivalent to {@code bootstrap(x, y, iterations, true)}, matching
 * {@code ks.boot(x, y, nboots = iterations)} from the R Matching package.
 * See {@link #bootstrap(double[], double[], int, boolean)}.
 *
 * @param x first sample
 * @param y second sample
 * @param iterations number of bootstrap resampling iterations
 * @return estimated p-value
 */
public double bootstrap(double[] x, double[] y, int iterations) {
    final boolean strict = true;
    return bootstrap(x, y, iterations, strict);
}
/**
 * Computes the <i>p-value</i>, or <i>observed significance level</i>, of a one-sample <a
 * href="http://en.wikipedia.org/wiki/Kolmogorov-Smirnov_test"> Kolmogorov-Smirnov test</a>
 * evaluating the null hypothesis that {@code data} conforms to {@code distribution}. If
 * {@code exact} is true, the distribution used to compute the p-value is computed using
 * extended precision. See {@link #cdfExact(double, int)}.
 *
 * @param distribution reference distribution
 * @param data sample being evaluated
 * @param exact whether or not to force exact computation of the p-value
 * @return the p-value associated with the null hypothesis that {@code data} is a sample from
 *         {@code distribution}
 * @throws InsufficientDataException if {@code data} does not have length at least 2
 * @throws NullArgumentException if {@code data} is null
 */
public double kolmogorovSmirnovTest(RealDistribution distribution, double[] data, boolean exact) {
    return 1d - cdf(kolmogorovSmirnovStatistic(distribution, data), data.length, exact);
}
/**
 * Approximates \(P(D_{n,m} > d)\) — or \(P(D_{n,m} \ge d)\) when {@code strict} is
 * {@code false} — by Monte Carlo simulation, where \(D_{n,m}\) is the 2-sample
 * Kolmogorov-Smirnov statistic (see
 * {@link #kolmogorovSmirnovStatistic(double[], double[])}).
 * <p>
 * {@code iterations} random partitions of {@code m + n} into an {@code n} set and an
 * {@code m} set are generated; the returned value is the proportion of partitions whose
 * \(D_{n,m}\) exceeds (or, non-strictly, reaches) {@code d}.
 * </p>
 *
 * @param d D-statistic value
 * @param n first sample size
 * @param m second sample size
 * @param iterations number of random partitions to generate
 * @param strict whether the probability is expressed as a strict inequality
 * @return proportion of randomly generated m-n partitions of m + n with \(D_{n,m}\)
 *         greater than (resp. greater than or equal to) {@code d}
 */
public double monteCarloP(final double d, final int n, final int m, final boolean strict,
        final int iterations) {
    final long integralD = calculateIntegralD(d, n, m, strict);
    return integralMonteCarloP(integralD, n, m, iterations);
}
/**
 * Computes \(P(D_{n,m} > d)\) if {@code strict} is {@code true}, otherwise
 * \(P(D_{n,m} \ge d)\), where \(D_{n,m}\) is the 2-sample Kolmogorov-Smirnov statistic
 * (see {@link #kolmogorovSmirnovStatistic(double[], double[])}).
 * <p>
 * The result is exact, obtained by unwinding the recursive definitions in [4]
 * (class javadoc).
 * </p>
 *
 * @param d D-statistic value
 * @param n first sample size
 * @param m second sample size
 * @param strict whether the probability is expressed as a strict inequality
 * @return probability that a randomly selected m-n partition of m + n generates
 *         \(D_{n,m}\) greater than (resp. greater than or equal to) {@code d}
 */
public double exactP(double d, int n, int m, boolean strict) {
    // Count the lattice paths staying under the integral form of d, then normalize by
    // the total number of m-n partitions of m + n.
    final double pathsBelow = n(m, n, m, n, calculateIntegralD(d, m, n, strict), strict);
    final double totalPartitions = CombinatoricsUtils.binomialCoefficientDouble(n + m, m);
    return 1 - pathsBelow / totalPartitions;
}
/** * Test if two clusters are significantly different in the metrics we look at for balancing. * * @param orig the utilization matrix from the original cluster * @param optimized the utilization matrix from the optimized cluster * @return The P value that the various derived resources come from the same probability distribution. The probability * that the null hypothesis is correct. */ public static double[] testDifference(double[][] orig, double[][] optimized) { int nResources = RawAndDerivedResource.values().length; if (orig.length != nResources) { throw new IllegalArgumentException("orig must have number of rows equal to RawAndDerivedResource."); } if (optimized.length != nResources) { throw new IllegalArgumentException("optimized must have number of rows equal to RawAndDerivedResource."); } if (orig[0].length != optimized[0].length) { throw new IllegalArgumentException("The number of brokers must be the same."); } double[] pValues = new double[orig.length]; //TODO: For small N we want to do statistical bootstrapping (not the same as bootstrapping data). for (int resourceIndex = 0; resourceIndex < nResources; resourceIndex++) { RandomGenerator rng = new MersenneTwister(0x5d11121018463324L); KolmogorovSmirnovTest kolmogorovSmirnovTest = new KolmogorovSmirnovTest(rng); pValues[resourceIndex] = kolmogorovSmirnovTest.kolmogorovSmirnovTest(orig[resourceIndex], optimized[resourceIndex]); } return pValues; }
// One-sample branch: test `second` (a List of Numbers) against a reference distribution.
KolmogorovSmirnovTest ks = new KolmogorovSmirnovTest();
double[] data = ((List<?>)second).stream().mapToDouble(item -> ((Number)item).doubleValue()).toArray();
m.put("p-value", ks.kolmogorovSmirnovTest(realDistribution, data));
m.put("d-statistic", ks.kolmogorovSmirnovStatistic(realDistribution, data));
return new Tuple(m);
// NOTE(review): the two statements below are unreachable as presented — presumably they
// belong to a separate (two-sample) branch lost in extraction; verify against upstream.
// NOTE(review): storing the result of kolmogorovSmirnovTest (a p-value) under the
// "d-statistic" key looks like a bug — kolmogorovSmirnovStatistic(data, data2) was
// presumably intended; confirm before relying on this value.
m.put("d-statistic", ks.kolmogorovSmirnovTest(data, data2));
return new Tuple(m);
/**
 * Static convenience wrapper that delegates to the shared {@code KS_TEST} instance.
 *
 * @see org.apache.commons.math3.stat.inference.KolmogorovSmirnovTest#kolmogorovSmirnovStatistic(RealDistribution, double[])
 * @since 3.3
 */
public static double kolmogorovSmirnovStatistic(RealDistribution dist, double[] data)
        throws InsufficientDataException, NullArgumentException {
    final double statistic = KS_TEST.kolmogorovSmirnovStatistic(dist, data);
    return statistic;
}
/**
 * Calculates \(P(D_n &lt; d)\) using the method described in [1], with quick decisions
 * for extreme values per [2] (see above). Unlike {@link #cdfExact(double, int)}, the
 * computation is {@code double}-based rather than
 * {@link org.apache.commons.math3.fraction.BigFraction}-based, so the result is not exact.
 *
 * @param d statistic
 * @param n sample size
 * @return \(P(D_n &lt; d)\)
 * @throws MathArithmeticException if the algorithm fails to convert {@code h} to a
 *         {@link org.apache.commons.math3.fraction.BigFraction} in expressing {@code d}
 *         as \((k - h) / m\) for integer {@code k, m} and \(0 \le h &lt; 1\)
 */
public double cdf(double d, int n) throws MathArithmeticException {
    final boolean exact = false;
    return cdf(d, n, exact);
}
/**
 * Static convenience wrapper that delegates to the shared {@code KS_TEST} instance.
 *
 * @see org.apache.commons.math3.stat.inference.KolmogorovSmirnovTest#approximateP(double, int, int)
 * @since 3.3
 */
public static double approximateP(double d, int n, int m) {
    final double pValue = KS_TEST.approximateP(d, n, m);
    return pValue;
}
/**
 * Runs a two-sample Kolmogorov-Smirnov test between the two count samples held by this
 * instance.
 * NOTE(review): assumes {@code p1Counts} and {@code p2Counts} are populated instance
 * fields — confirm against the enclosing class.
 *
 * @return the p-value of the two-sample KS test on {@code p1Counts} vs {@code p2Counts}
 */
public double getKStest() {
    return new KolmogorovSmirnovTest().kolmogorovSmirnovTest(p1Counts, p2Counts);
}
/**
 * Static convenience wrapper that delegates to the shared {@code KS_TEST} instance.
 *
 * @see org.apache.commons.math3.stat.inference.KolmogorovSmirnovTest#kolmogorovSmirnovStatistic(double[], double[])
 * @since 3.3
 */
public static double kolmogorovSmirnovStatistic(double[] x, double[] y)
        throws InsufficientDataException, NullArgumentException {
    final double statistic = KS_TEST.kolmogorovSmirnovStatistic(x, y);
    return statistic;
}
/**
 * Computes the <i>p-value</i>, or <i>observed significance level</i>, of a one-sample <a
 * href="http://en.wikipedia.org/wiki/Kolmogorov-Smirnov_test"> Kolmogorov-Smirnov test</a>
 * evaluating the null hypothesis that {@code data} conforms to {@code distribution}. If
 * {@code exact} is true, the distribution used to compute the p-value is computed using
 * extended precision. See {@link #cdfExact(double, int)}.
 *
 * @param distribution reference distribution
 * @param data sample being evaluated
 * @param exact whether or not to force exact computation of the p-value
 * @return the p-value associated with the null hypothesis that {@code data} is a sample from
 *         {@code distribution}
 * @throws InsufficientDataException if {@code data} does not have length at least 2
 * @throws NullArgumentException if {@code data} is null
 */
public double kolmogorovSmirnovTest(RealDistribution distribution, double[] data, boolean exact) {
    return 1d - cdf(kolmogorovSmirnovStatistic(distribution, data), data.length, exact);
}
/**
 * Computes \(P(D_{n,m} > d)\) if {@code strict} is {@code true}, otherwise
 * \(P(D_{n,m} \ge d)\), where \(D_{n,m}\) is the 2-sample Kolmogorov-Smirnov statistic
 * (see {@link #kolmogorovSmirnovStatistic(double[], double[])}).
 * <p>
 * The result is exact, obtained by unwinding the recursive definitions in [4]
 * (class javadoc).
 * </p>
 *
 * @param d D-statistic value
 * @param n first sample size
 * @param m second sample size
 * @param strict whether the probability is expressed as a strict inequality
 * @return probability that a randomly selected m-n partition of m + n generates
 *         \(D_{n,m}\) greater than (resp. greater than or equal to) {@code d}
 */
public double exactP(double d, int n, int m, boolean strict) {
    // Count the lattice paths staying under the integral form of d, then normalize by
    // the total number of m-n partitions of m + n.
    final double pathsBelow = n(m, n, m, n, calculateIntegralD(d, m, n, strict), strict);
    final double totalPartitions = CombinatoricsUtils.binomialCoefficientDouble(n + m, m);
    return 1 - pathsBelow / totalPartitions;
}
/**
 * Approximates \(P(D_{n,m} > d)\) — or \(P(D_{n,m} \ge d)\) when {@code strict} is
 * {@code false} — by Monte Carlo simulation, where \(D_{n,m}\) is the 2-sample
 * Kolmogorov-Smirnov statistic (see
 * {@link #kolmogorovSmirnovStatistic(double[], double[])}).
 * <p>
 * {@code iterations} random partitions of {@code m + n} into an {@code n} set and an
 * {@code m} set are generated; the returned value is the proportion of partitions whose
 * \(D_{n,m}\) exceeds (or, non-strictly, reaches) {@code d}.
 * </p>
 *
 * @param d D-statistic value
 * @param n first sample size
 * @param m second sample size
 * @param iterations number of random partitions to generate
 * @param strict whether the probability is expressed as a strict inequality
 * @return proportion of randomly generated m-n partitions of m + n with \(D_{n,m}\)
 *         greater than (resp. greater than or equal to) {@code d}
 */
public double monteCarloP(final double d, final int n, final int m, final boolean strict,
        final int iterations) {
    final long integralD = calculateIntegralD(d, n, m, strict);
    return integralMonteCarloP(integralD, n, m, iterations);
}
/**
 * Calculates {@code P(D_n < d)} exactly, using BigFraction/BigReal throughout at the
 * cost of very slow execution. Intended almost solely for verification; prefer
 * {@link #cdf(double, int)} in real applications. See the class javadoc for definitions
 * and the algorithm description.
 *
 * @param d statistic
 * @param n sample size
 * @return \(P(D_n &lt; d)\)
 * @throws MathArithmeticException if the algorithm fails to convert {@code h} to a
 *         {@link org.apache.commons.math3.fraction.BigFraction} in expressing {@code d}
 *         as \((k - h) / m\) for integer {@code k, m} and \(0 \le h &lt; 1\)
 */
public double cdfExact(double d, int n) throws MathArithmeticException {
    final boolean exact = true;
    return cdf(d, n, exact);
}
/**
 * Static convenience wrapper that delegates to the shared {@code KS_TEST} instance.
 *
 * @see org.apache.commons.math3.stat.inference.KolmogorovSmirnovTest#approximateP(double, int, int)
 * @since 3.3
 */
public static double approximateP(double d, int n, int m) {
    final double pValue = KS_TEST.approximateP(d, n, m);
    return pValue;
}