/**
 * Create a limiter that keeps only the top {@code n} values, ranked by how often they
 * are looked up. It is a good fit when the value frequencies are non-uniform and the
 * most common values are the ones that matter. When many values have roughly the same
 * frequency, a {@link #first(int)} limiter is used to keep the number of values within
 * bounds.
 *
 * <p>The limiter adapts as the frequencies change over time, but it also has to guard
 * against high rates of churn in the values. As a consequence, a value that newly
 * becomes high frequency may be grouped under {@link #OTHERS} for a while before it
 * is reported as itself.</p>
 *
 * @param n
 *     Number of values to select.
 * @return
 *     The input value if it is within the bounds or is selected. Otherwise the value
 *     is mapped to {@link #OTHERS}.
 */
public static Function<String, String> mostFrequent(int n) {
  // Public entry point: always backed by the system clock.
  final Clock systemClock = Clock.SYSTEM;
  return mostFrequent(n, systemClock);
}
/**
 * Create a new instance.
 *
 * @param n
 *     Number of values to select.
 * @param clock
 *     Clock used to read the wall time recorded as the limiter timestamp.
 */
MostFrequentLimiter(int n, Clock clock) {
  // Configuration supplied by the caller.
  this.clock = clock;
  this.n = n;

  // Begin with a first(n) limiter and remember the wall time at which it was created.
  this.limiter = first(n);
  this.limiterTimestamp = this.clock.wallTime();

  // Cutoff and high-churn counter both start at zero.
  this.cutoff = 0L;
  this.updatesWithHighChurn = 0;
}
/**
 * Create a limiter that keeps only the top {@code n} values, ranked by how often they
 * are looked up. It is a good fit when the value frequencies are non-uniform and the
 * most common values are the ones that matter. When many values have roughly the same
 * frequency, a {@link #first(int)} limiter is used to keep the number of values within
 * bounds.
 *
 * <p>The limiter adapts as the frequencies change over time, but it also has to guard
 * against high rates of churn in the values. As a consequence, a value that newly
 * becomes high frequency may be grouped under {@link #OTHERS} for a while before it
 * is reported as itself.</p>
 *
 * @param n
 *     Number of values to select.
 * @return
 *     The input value if it is within the bounds or is selected. Otherwise the value
 *     is mapped to {@link #OTHERS}.
 */
public static Function<String, String> mostFrequent(int n) {
  // Public entry point: always backed by the system clock.
  final Clock systemClock = Clock.SYSTEM;
  return mostFrequent(n, systemClock);
}
/**
 * Create a new instance.
 *
 * @param n
 *     Number of values to select.
 * @param clock
 *     Clock used to read the wall time recorded as the limiter timestamp.
 */
MostFrequentLimiter(int n, Clock clock) {
  // Configuration supplied by the caller.
  this.clock = clock;
  this.n = n;

  // Begin with a first(n) limiter and remember the wall time at which it was created.
  this.limiter = first(n);
  this.limiterTimestamp = this.clock.wallTime();

  // Cutoff and high-churn counter both start at zero.
  this.cutoff = 0L;
  this.updatesWithHighChurn = 0;
}
/**
 * Look up the cardinality limiter to use for the values of {@code key}. Limiting the
 * values protects the metrics backend from a metrics explosion when some dimensions
 * have a high cardinality.
 *
 * @param key
 *     Key whose limiter is requested.
 * @return
 *     Limiter obtained from the {@code limiters} map for {@code key}; the function
 *     handed to {@code Utils.computeIfAbsent} supplies a
 *     {@code CardinalityLimiters.mostFrequent(10)} limiter.
 */
Function<String, String> limiterForKey(String key) {
  return Utils.computeIfAbsent(
      limiters,
      key,
      unused -> CardinalityLimiters.mostFrequent(10));
}
/**
 * Look up the cardinality limiter to use for the values of {@code key}. Limiting the
 * values protects the metrics backend from a metrics explosion when some dimensions
 * have a high cardinality.
 *
 * @param key
 *     Key whose limiter is requested.
 * @return
 *     Limiter obtained from the {@code limiters} map for {@code key}; the function
 *     handed to {@code Utils.computeIfAbsent} supplies a
 *     {@code CardinalityLimiters.mostFrequent(10)} limiter.
 */
Function<String, String> limiterForKey(String key) {
  return Utils.computeIfAbsent(
      limiters,
      key,
      unused -> CardinalityLimiters.mostFrequent(10));
}
@Test public void mostFrequentAllUnique() { // Ensure we have a somewhat stable set and there isn't a memory leak if every value is // unique. For example, if a user tried to use a request id. ManualClock clock = new ManualClock(0L, 0L); Function<String, String> f = CardinalityLimiters.mostFrequent(2, clock); Set<String> values = new TreeSet<>(); for (int i = 0; i < 10000; ++i) { values.add(f.apply("" + i)); clock.setWallTime(i * 1000); } // The values less than equal 9616 should have been cleaned up based on the clock Assertions.assertFalse(f.toString().contains("9616")); Assertions.assertEquals(3, values.size()); }
/** A {@code first(2)} limiter that has seen no values renders with an empty list. */
@Test
public void firstToStringEmpty() {
  Function<String, String> limiter = CardinalityLimiters.first(2);
  String rendered = limiter.toString();
  Assertions.assertEquals("FirstLimiter()", rendered);
}
@Test public void mostFrequentTemporaryChurn() { ManualClock clock = new ManualClock(0L, 0L); Function<String, String> f = CardinalityLimiters.mostFrequent(2, clock); Set<String> values = new TreeSet<>(); for (int t = 0; t < 250; ++t) { if (t < 100) { values.add(f.apply("a")); } else if (t < 117) { // Simulates 17 minutes of high churn for (int i = 0; i < 200; ++i) { values.add(f.apply("" + i)); } } else { // This should come through within 2h values.add(f.apply("b")); } clock.setWallTime(t * 60000); } Assertions.assertEquals(6, values.size()); Assertions.assertEquals("b", f.apply("b")); }
/** A {@code first(2)} limiter that has seen a single value lists just that value. */
@Test
public void firstToStringPartial() {
  Function<String, String> limiter = CardinalityLimiters.first(2);

  String mapped = limiter.apply("b");
  Assertions.assertEquals("b", mapped);

  String rendered = limiter.toString();
  Assertions.assertEquals("FirstLimiter(b)", rendered);
}
/**
 * With exactly as many distinct values as the limit allows, every value must be
 * passed through unchanged on every step.
 */
@Test
public void mostFrequentUnderLimit() {
  final int limit = 27;
  ManualClock clock = new ManualClock(0L, 0L);
  Function<String, String> limiter = CardinalityLimiters.mostFrequent(limit, clock);

  for (int step = 0; step < 1000; ++step) {
    for (int i = 0; i < limit; ++i) {
      String value = String.valueOf(i);
      Assertions.assertEquals(value, limiter.apply(String.valueOf(i)));
    }
    // Advance the wall clock by one second per step
    clock.setWallTime(step * 1000);
  }
}
/** A {@code first(2)} limiter that has seen two values lists both of them. */
@Test
public void firstToStringFull() {
  Function<String, String> limiter = CardinalityLimiters.first(2);

  // Both values fit within the limit and are returned as is
  for (String value : new String[] {"a", "b"}) {
    Assertions.assertEquals(value, limiter.apply(value));
  }

  String rendered = limiter.toString();
  Assertions.assertEquals("FirstLimiter(a,b)", rendered);
}
List<Function<String, String>> limiters = new ArrayList<>(); for (int i = 0; i < clusterSize; ++i) { limiters.add(CardinalityLimiters.mostFrequent(cardinalityLimit, clock));
/**
 * A {@code first(2)} limiter passes through the first two distinct values, maps a
 * third to {@code OTHERS}, and keeps returning an already accepted value.
 */
@Test
public void first2() {
  Function<String, String> limiter = CardinalityLimiters.first(2);

  // The first two distinct values are accepted
  String first = limiter.apply("a");
  Assertions.assertEquals("a", first);
  String second = limiter.apply("b");
  Assertions.assertEquals("b", second);

  // A third distinct value exceeds the limit
  String third = limiter.apply("c");
  Assertions.assertEquals(CardinalityLimiters.OTHERS, third);

  // Previously accepted values are still returned
  String again = limiter.apply("a");
  Assertions.assertEquals("a", again);
}
List<Function<String, String>> limiters = new ArrayList<>(); for (int i = 0; i < clusterSize; ++i) { limiters.add(CardinalityLimiters.mostFrequent(cardinalityLimit, clock));
@Test public void mostFrequentIsUsed() { ManualClock clock = new ManualClock(0L, 0L); Function<String, String> f = CardinalityLimiters.mostFrequent(2, clock); // Setup some basic stats updateN(f, 4, "a"); updateN(f, 3, "b"); updateN(f, 2, "c"); updateN(f, 1, "d"); // Refresh cutoff, should be 3 for the top 2 advanceClock(clock); Assertions.assertEquals("a", f.apply("a")); // If the values are close then bias towards the names that come first alphabetically Assertions.assertEquals(CardinalityLimiters.OTHERS, f.apply("c")); Assertions.assertEquals("b", f.apply("b")); // Until the cutoff is updated, "d" won't show up no matter how frequent Assertions.assertEquals(CardinalityLimiters.OTHERS, f.apply("d")); updateN(f, 42, "d"); Assertions.assertEquals(CardinalityLimiters.OTHERS, f.apply("d")); // Now "d" is most frequent advanceClock(clock); Assertions.assertEquals("d", f.apply("d")); }