/**
 * Loads a text file of JSON-encoded lines as a pair RDD of feature vectors.
 * Each line is parsed as a JSON list: element 0 (via {@code toString()}) becomes the key,
 * and element 1 is converted to a {@code float[]} which becomes the value.
 *
 * @param sparkContext context used to read the text file
 * @param path location of the feature lines
 * @return RDD of (key, vector) pairs, one per input line
 */
private static JavaPairRDD<String,float[]> readFeaturesRDD(JavaSparkContext sparkContext, Path path) {
  log.info("Loading features RDD from {}", path);
  return sparkContext.textFile(path.toString()).mapToPair(line -> {
    List<?> fields = TextUtils.readJSON(line, List.class);
    return new Tuple2<>(fields.get(0).toString(),
                        TextUtils.convertViaJSON(fields.get(1), float[].class));
  });
}
/**
 * Builds the JSON text of an update message.
 *
 * @param matrix first element of the message
 * @param ID second element of the message
 * @param vector third element of the message
 * @param otherID wrapped in a single-element list and appended as a fourth element,
 *  unless {@code noKnownItems} is set, in which case it is omitted
 * @return the elements joined as JSON
 */
private String toUpdateJSON(String matrix, String ID, float[] vector, String otherID) {
  if (noKnownItems) {
    return TextUtils.joinJSON(Arrays.asList(matrix, ID, vector));
  }
  return TextUtils.joinJSON(
      Arrays.asList(matrix, ID, vector, Collections.singletonList(otherID)));
}
/**
 * Finds the cluster nearest to a comma-delimited datum.
 *
 * @param datum comma-delimited data point; checked to be non-null and non-empty
 * @return ID of the nearest cluster, as a string
 * @throws OryxServingException with status {@code BAD_REQUEST}, carrying the original message,
 *  if the model rejects the tokens with an {@link IllegalArgumentException}
 */
private String nearestClusterID(String datum) throws OryxServingException {
  check(datum != null && !datum.isEmpty(), "Data is needed to cluster");
  String[] tokens = TextUtils.parseDelimited(datum, ',');
  ClusteringServingModel model = (ClusteringServingModel) getServingModel();
  try {
    return Integer.toString(model.nearestClusterID(tokens));
  } catch (IllegalArgumentException iae) {
    // Input the model cannot interpret is reported as a client error
    throw new OryxServingException(Response.Status.BAD_REQUEST, iae.getMessage());
  }
}
// Fetch the i-th update and verify that it is keyed as an "UP" message
KeyMessage<String,String> update = updates.get(i);
assertEquals("UP", update.getKey());
// The message body is read as a JSON list: [0] cluster ID, [1] center, [2] cluster size
List<?> fields = TextUtils.readJSON(update.getMessage(), List.class);
int clusterID = (Integer) fields.get(0);
double[] updatedCenter = TextUtils.convertViaJSON(fields.get(1), double[].class);
int updatedClusterSize = (Integer) fields.get(2);
// Record the parsed update, indexed by its cluster ID
clusterInfos.put(clusterID, new ClusterInfo(clusterID, updatedCenter, updatedClusterSize));
// NOTE(review): 'id' and 'clusters' are declared outside this excerpt -- confirm which cluster is looked up
Cluster cluster = clusters.get(id);
// The cluster's array value is split on spaces and parsed into a numeric vector
String[] tokens = TextUtils.parseDelimited(cluster.getArray().getValue(), ' ');
double[] modelCenter = VectorMath.parseVector(tokens);
/**
 * Checks {@code TextUtils.convertViaJSON} on a numeric string converted to {@code Long}
 * and on a {@code double[]} converted to {@code float[]}.
 */
@Test
public void testConvertViaJSON() {
  long three = TextUtils.convertViaJSON("3", Long.class).longValue();
  assertEquals(3, three);

  float[] expectedFloats = { 1.0f, 2.0f };
  float[] actualFloats = TextUtils.convertViaJSON(new double[] { 1.0, 2.0 }, float[].class);
  assertArrayEquals(expectedFloats, actualFloats);
}
/**
 * Checks {@code TextUtils.joinPMMLDelimited} on plain tokens, on tokens containing spaces,
 * and on tokens containing double quotes.
 */
@Test
public void testJoinPMMLDelimited() {
  String mixed = TextUtils.joinPMMLDelimited(Arrays.asList("ab", "a b", "with \"quotes\" "));
  assertEquals("ab \"a b\" \"with \\\"quotes\\\" \"", mixed);

  String plain = TextUtils.joinPMMLDelimited(Arrays.asList("1", "22", "3"));
  assertEquals("1 22 3", plain);

  String quoted = TextUtils.joinPMMLDelimited(Arrays.asList(" c\" d \"e ", " c\" d \"e "));
  assertEquals("\" c\\\" d \\\"e \" \" c\\\" d \\\"e \"", quoted);
}
/**
 * Checks {@code TextUtils.joinDelimited} with comma, space and tab delimiters, covering
 * values that contain the delimiter, values that contain double quotes, and an empty input.
 */
@Test
public void testJoinDelimited() {
  char comma = ',';
  char space = ' ';

  // Comma-delimited
  String simpleCsv = TextUtils.joinDelimited(Arrays.asList("1", "2", "3"), comma);
  assertEquals("1,2,3", simpleCsv);
  String embeddedComma = TextUtils.joinDelimited(Arrays.asList("a,b"), comma);
  assertEquals("\"a,b\"", embeddedComma);
  String quotedCsv = TextUtils.joinDelimited(Arrays.asList("\"a\""), comma);
  assertEquals("\"\"\"a\"\"\"", quotedCsv);

  // Space-delimited
  String simpleSpaced = TextUtils.joinDelimited(Arrays.asList("1", "2", "3"), space);
  assertEquals("1 2 3", simpleSpaced);
  String trailingSpaces = TextUtils.joinDelimited(Arrays.asList("1 ", "2 ", "3"), space);
  assertEquals("\"1 \" \"2 \" 3", trailingSpaces);
  String quotedSpaced = TextUtils.joinDelimited(Arrays.asList("\"a\""), space);
  assertEquals("\"\"\"a\"\"\"", quotedSpaced);
  String onlyQuotes = TextUtils.joinDelimited(Arrays.asList("\"", "\"\""), space);
  assertEquals("\"\"\"\" \"\"\"\"\"\"", onlyQuotes);

  // Nothing to join
  assertEquals("", TextUtils.joinDelimited(Collections.emptyList(), '\t'));
}
/**
 * Builds a PMML {@link Array} of type {@code REAL} from primitive values.
 *
 * @param values {@code double} values to place in the array
 * @return PMML {@link Array} whose content is the values joined with
 *  {@code TextUtils.joinPMMLDelimitedNumbers} and whose {@code n} is the number of values
 */
public static Array toArray(double... values) {
  int count = values.length;
  // Box the values, since the join helper is given a List
  List<Double> boxed = new ArrayList<>(count);
  for (int i = 0; i < count; i++) {
    boxed.add(values[i]);
  }
  return new Array(Array.Type.REAL, TextUtils.joinPMMLDelimitedNumbers(boxed)).setN(count);
}
/**
 * Looks up the first extension with the given name and parses its content.
 *
 * @param pmml PMML model to query for extensions
 * @param name name of extension to query
 * @return content of the first matching extension, parsed as if it were a PMML {@link Array}
 *  (space-separated values, with PMML quoting rules); an empty list if the extension has
 *  no content; or {@code null} if no extension has the given name
 * @throws IllegalArgumentException if the matching extension has more than one content element
 */
public static List<String> getExtensionContent(PMML pmml, String name) {
  return pmml.getExtensions().stream()
      .filter(candidate -> name.equals(candidate.getName()))
      .findFirst()
      .map(match -> {
        List<?> content = match.getContent();
        // At most one content element is supported
        Preconditions.checkArgument(content.size() <= 1);
        List<String> values;
        if (content.isEmpty()) {
          values = Collections.emptyList();
        } else {
          values = Arrays.asList(TextUtils.parsePMMLDelimited(content.get(0).toString()));
        }
        return values;
      })
      .orElse(null);
}
// The message is read as a JSON list: element 0 is parsed as the integer tree ID,
// and element 1 is taken (via toString) as the node ID
List<?> update = TextUtils.readJSON(message, List.class);
int treeID = Integer.parseInt(update.get(0).toString());
String nodeID = update.get(1).toString();
/**
 * Adds a named extension to a PMML model, holding the given values as a single
 * {@code String} content element. The values are encoded as if they were being added to a
 * PMML {@link Array}: space-separated, with PMML quoting rules. Nothing is added when
 * {@code content} is empty.
 *
 * @param pmml PMML model to add the extension to
 * @param key extension name
 * @param content values to join into the extension's content
 */
public static void addExtensionContent(PMML pmml, String key, Collection<?> content) {
  if (!content.isEmpty()) {
    String encoded = TextUtils.joinPMMLDelimited(content);
    pmml.addExtensions(new Extension().setName(key).addContent(encoded));
  }
}
/**
 * Generates one datum made of {@code numberOfDimensions} random values.
 *
 * @param id used, in string form, as the first element of the result
 * @param random source of the values; {@code nextDouble()} is called once per dimension
 * @return pair of the ID and the values joined with commas
 */
@Override
public Pair<String,String> generate(int id, RandomGenerator random) {
  List<String> coordinates = new ArrayList<>(numberOfDimensions);
  for (int dim = 0; dim < numberOfDimensions; dim++) {
    coordinates.add(String.valueOf(random.nextDouble()));
  }
  String joined = TextUtils.joinDelimited(coordinates, ',');
  return new Pair<>(String.valueOf(id), joined);
}
}
/** Checks that {@code TextUtils.joinPMMLDelimitedNumbers} joins numbers with single spaces. */
@Test
public void testJoinPMMLDelimitedNumbers() {
  String joined = TextUtils.joinPMMLDelimitedNumbers(Arrays.asList(-1.0, 2.01, 3.5));
  assertEquals("-1.0 2.01 3.5", joined);
}
/**
 * Checks {@code TextUtils.parsePMMLDelimited} on plain tokens, quoted tokens containing
 * spaces, escaped quotes, and input with leading and trailing whitespace.
 */
@Test
public void testParsePMMLDelimited() {
  String[] plain = TextUtils.parsePMMLDelimited("1 22 3");
  assertArrayEquals(new String[] {"1", "22", "3"}, plain);

  String[] mixed = TextUtils.parsePMMLDelimited("ab \"a b\" \"with \\\"quotes\\\" \" ");
  assertArrayEquals(new String[] {"ab", "a b", "with \"quotes\" "}, mixed);

  String[] escapedOnly = TextUtils.parsePMMLDelimited("\"\\\" \\\"\"");
  assertArrayEquals(new String[] {"\" \""}, escapedOnly);

  String[] padded = TextUtils.parsePMMLDelimited(" \" c\\\" d \\\"e \" \" c\\\" d \\\"e \" ");
  assertArrayEquals(new String[] {" c\" d \"e ", " c\" d \"e "}, padded);
}
@Override public void consumeKeyMessage(String key, String message, Configuration hadoopConf) throws IOException { switch (key) { case "UP": if (model == null) { return; // No model to interpret with yet, so skip it } List<?> update = TextUtils.readJSON(message, List.class); // Update int id = Integer.parseInt(update.get(0).toString()); double[] center = TextUtils.convertViaJSON(update.get(1), double[].class); long count = Long.parseLong(update.get(2).toString()); model.update(id, center, count); break; case "MODEL": case "MODEL-REF": log.info("Loading new model"); PMML pmml = AppPMMLUtils.readPMMLFromUpdateKeyMessage(key, message, hadoopConf); if (pmml == null) { return; } KMeansPMMLUtils.validatePMMLVsSchema(pmml, inputSchema); List<ClusterInfo> clusters = KMeansPMMLUtils.read(pmml); model = new KMeansServingModel(clusters, inputSchema); log.info("New model: {}", model); break; default: throw new IllegalArgumentException("Bad key: " + key); } }
/**
 * Sends every (ID, vector) pair from the iterator as an {@code "UP"} message whose body is
 * the JSON join of {@code whichMatrix}, the ID and the vector. A producer is only created
 * when there is at least one pair, and is closed once all pairs have been sent.
 *
 * @param it pairs of ID and feature vector to publish
 */
@Override
public void call(Iterator<Tuple2<String,float[]>> it) {
  if (!it.hasNext()) {
    return;
  }
  try (TopicProducer<String,String> producer = new TopicProducerImpl<>(updateBroker, topic, true)) {
    while (it.hasNext()) {
      Tuple2<String,float[]> idAndVector = it.next();
      String json =
          TextUtils.joinJSON(Arrays.asList(whichMatrix, idAndVector._1(), idAndVector._2()));
      producer.send("UP", json);
    }
  }
}
// Fetch the i-th update and verify that it is keyed as an "UP" message
KeyMessage<String, String> update = updates.get(i);
assertEquals("UP", update.getKey());
// The message body is read as a JSON list: [0] tree ID, [1] node ID
List<?> fields = TextUtils.readJSON(update.getMessage(), List.class);
int treeID = (Integer) fields.get(0);
String nodeID = fields.get(1).toString();
// Read two consecutive updates (i and i + 1) and take element [3] of each as an integer count
KeyMessage<String, String> update1 = updates.get(i);
KeyMessage<String, String> update2 = updates.get(i + 1);
List<?> fields1 = TextUtils.readJSON(update1.getMessage(), List.class);
List<?> fields2 = TextUtils.readJSON(update2.getMessage(), List.class);
int count1 = (Integer) fields1.get(3);
int count2 = (Integer) fields2.get(3);
/**
 * Assigns a comma-delimited datum to its closest cluster.
 *
 * @param datum comma-delimited data point from the request path; checked to be non-null
 *  and non-empty
 * @return string form of the second element of the pair returned by
 *  {@code KMeansServingModel.closestCluster}
 * @throws OryxServingException declared for the {@code check} of the datum
 */
@GET
@Path("{datum}")
@Produces({MediaType.TEXT_PLAIN, "text/csv", MediaType.APPLICATION_JSON})
public String get(@PathParam("datum") String datum) throws OryxServingException {
  check(datum != null && !datum.isEmpty(), "Data is needed to cluster");
  KMeansServingModel servingModel = (KMeansServingModel) getServingModel();
  return servingModel
      .closestCluster(KMeansUtils.featuresFromTokens(TextUtils.parseDelimited(datum, ','),
                                                     servingModel.getInputSchema()))
      .getSecond()
      .toString();
}
private Predicate buildPredicate(Split split, CategoricalValueEncodings categoricalValueEncodings) { if (split == null) { // Left child always applies, but is evaluated second return new True(); } int featureIndex = inputSchema.predictorToFeatureIndex(split.feature()); FieldName fieldName = FieldName.create(inputSchema.getFeatureNames().get(featureIndex)); if (split.featureType().equals(FeatureType.Categorical())) { // Note that categories in MLlib model select the *left* child but the // convention here will be that the predicate selects the *right* child // So the predicate will evaluate "not in" this set // More ugly casting @SuppressWarnings("unchecked") Collection<Double> javaCategories = (Collection<Double>) (Collection<?>) JavaConversions.seqAsJavaList(split.categories()); Set<Integer> negativeEncodings = javaCategories.stream().map(Double::intValue).collect(Collectors.toSet()); Map<Integer,String> encodingToValue = categoricalValueEncodings.getEncodingValueMap(featureIndex); List<String> negativeValues = negativeEncodings.stream().map(encodingToValue::get).collect(Collectors.toList()); String joinedValues = TextUtils.joinPMMLDelimited(negativeValues); return new SimpleSetPredicate(fieldName, SimpleSetPredicate.BooleanOperator.IS_NOT_IN, new Array(Array.Type.STRING, joinedValues)); } else { // For MLlib, left means <= threshold, so right means > return new SimplePredicate(fieldName, SimplePredicate.Operator.GREATER_THAN) .setValue(Double.toString(split.threshold())); } }