/**
 * Creates a text encoder for the named feature.
 *
 * <p>Forwards {@code name} and the constant {@code 2} to the superclass constructor, then
 * starts with an empty multiset in {@code counts} and a {@link StaticWordValueEncoder} built
 * from the same name as the per-word encoder.
 *
 * @param name the name passed to both the superclass and the word encoder
 */
public TextValueEncoder(String name) {
  super(name, 2);
  this.counts = HashMultiset.create();
  this.wordEncoder = new StaticWordValueEncoder(name);
}
/**
 * Creates a text encoder for the named feature.
 *
 * <p>Forwards {@code name} and the constant {@code 2} to the superclass constructor, then
 * starts with an empty multiset in {@code counts} and a {@link StaticWordValueEncoder} built
 * from the same name as the per-word encoder.
 *
 * @param name the name passed to both the superclass and the word encoder
 */
public TextValueEncoder(String name) {
  super(name, 2);
  this.counts = HashMultiset.create();
  this.wordEncoder = new StaticWordValueEncoder(name);
}
/**
 * Creates a text encoder for the named feature.
 *
 * <p>Forwards {@code name} and the constant {@code 2} to the superclass constructor, then
 * starts with an empty multiset in {@code counts} and a {@link StaticWordValueEncoder} built
 * from the same name as the per-word encoder.
 *
 * @param name the name passed to both the superclass and the word encoder
 */
public TextValueEncoder(String name) {
  super(name, 2);
  this.counts = HashMultiset.create();
  this.wordEncoder = new StaticWordValueEncoder(name);
}
/**
 * Builds an {@link ArgumentEncoder} for the given position and name whose {@code encoder}
 * field is a {@link StaticWordValueEncoder} created with the same name.
 *
 * @param position the position passed to the {@code ArgumentEncoder} constructor
 * @param name the name shared by the argument encoder and its word encoder
 * @return the newly configured argument encoder
 */
public static ArgumentEncoder newWordEncoder(int position, String name) {
  ArgumentEncoder wordArgument = new ArgumentEncoder(position, name);
  wordArgument.encoder = new StaticWordValueEncoder(name);
  return wordArgument;
}
/**
 * Demo entry point: tokenizes a fixed sentence with a {@link StandardAnalyzer}, adds every
 * token to a sparse vector created with size 100 using weight 1, and prints the vector.
 *
 * @param args ignored
 * @throws IOException if the token stream fails while being read, finished or closed
 */
public static void main(String[] args) throws IOException {
  FeatureVectorEncoder encoder = new StaticWordValueEncoder("text");
  Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_31);
  StringReader in = new StringReader("text to magically vectorize");
  Vector v1 = new RandomAccessSparseVector(100);

  TokenStream ts = analyzer.tokenStream("body", in);
  try {
    TermAttribute termAtt = ts.addAttribute(TermAttribute.class);
    while (ts.incrementToken()) {
      // Copy the term out of the attribute's buffer; the buffer itself is only read here.
      String w = new String(termAtt.termBuffer(), 0, termAtt.termLength());
      encoder.addToVector(w, 1, v1);
    }
    // Signal end-of-stream once all tokens are consumed, per the TokenStream consumer workflow.
    ts.end();
  } finally {
    // Previously the stream was never closed; release it even if tokenization fails.
    ts.close();
  }

  System.out.printf("%s\n", new SequentialAccessSparseVector(v1));
}
/**
 * Checks the string form produced by {@code asString} for a single word.
 *
 * <p>The default locale is switched to English for the duration of the check and restored
 * afterwards, so the JVM-wide setting does not leak into tests that run later.
 */
@Test
public void testAsString() {
  Locale previousDefault = Locale.getDefault();
  Locale.setDefault(Locale.ENGLISH);
  try {
    FeatureVectorEncoder enc = new StaticWordValueEncoder("word");
    assertEquals("word:w1:1.0000", enc.asString("w1"));
  } finally {
    // Locale.setDefault mutates global state; always put the original value back.
    Locale.setDefault(previousDefault);
  }
}
// Word encoder created with the name "body".
// NOTE(review): the enclosing method is not visible in this snippet — confirm how it is used.
FeatureVectorEncoder encoder = new StaticWordValueEncoder("body");
// Set the encoder's probe count to 2.
encoder.setProbes(2);
// Pass in the trace dictionary, which is declared outside this snippet.
encoder.setTraceDictionary(traceDictionary);
/**
 * Adds two words to a 200-element dense vector and checks that the non-zero entries sit at
 * exactly the expected indexes, each holding the value 1.
 */
@Test
public void testAddToVector() {
  FeatureVectorEncoder enc = new StaticWordValueEncoder("word");
  Vector v = new DenseVector(200);
  enc.addToVector("word1", v);
  enc.addToVector("word2", v);

  Iterator<Integer> expectedIndexes = ImmutableList.of(7, 118, 119, 199).iterator();
  for (Vector.Element nonZero : v.nonZeroes()) {
    assertEquals(expectedIndexes.next().intValue(), nonZero.index());
    assertEquals(1, nonZero.get(), 0);
  }
  // Every expected index must have been matched by a non-zero element.
  assertFalse(expectedIndexes.hasNext());
}
/**
 * Encodes one interaction with a single probe and checks that exactly one vector slot is set
 * and that the trace dictionary records it under the key {@code "interactions=a:b"}.
 */
@Test
public void testTraceDictionary() {
  StaticWordValueEncoder firstEncoder = new StaticWordValueEncoder("first");
  StaticWordValueEncoder secondEncoder = new StaticWordValueEncoder("second");
  InteractionValueEncoder interactions =
      new InteractionValueEncoder("interactions", firstEncoder, secondEncoder);

  Map<String, Set<Integer>> trace = Maps.newHashMap();
  interactions.setProbes(1);
  interactions.setTraceDictionary(trace);

  Vector target = new DenseVector(10);
  interactions.addInteractionToVector("a", "b", 1, target);

  assertEquals(1, target.getNumNonZeroElements());
  assertEquals(1, trace.size());
  Set<String> tracedNames = trace.keySet();
  assertEquals("interactions=a:b", getFirst(tracedNames, null));
}
/**
 * Gives the encoder a dictionary with weights for "word1" and "word2", encodes three words
 * (the third is not in the dictionary) and checks first the indexes, then the values, of the
 * non-zero entries in iteration order.
 */
@Test
public void testStaticWeights() {
  StaticWordValueEncoder enc = new StaticWordValueEncoder("word");
  enc.setDictionary(ImmutableMap.<String, Double>of("word1", 3.0, "word2", 1.5));

  Vector v = new DenseVector(200);
  enc.addToVector("word1", v);
  enc.addToVector("word2", v);
  enc.addToVector("word3", v);

  Iterator<Integer> expectedIndexes = ImmutableList.of(7, 101, 118, 119, 152, 199).iterator();
  Iterator<Double> expectedWeights = ImmutableList.of(3.0, 0.75, 1.5, 1.5, 0.75, 3.0).iterator();

  // First pass: positions only.
  for (Vector.Element nonZero : v.nonZeroes()) {
    assertEquals(expectedIndexes.next().intValue(), nonZero.index());
  }

  // Second pass: the weight stored at each position.
  for (Vector.Element nonZero : v.nonZeroes()) {
    String message = String.format("checking v[%d]", nonZero.index());
    assertEquals(message, expectedWeights.next(), nonZero.get(), 0);
  }

  assertFalse(expectedIndexes.hasNext());
}
@Test public void testAddToVectorWithTextValueEncoder() { WordValueEncoder wv = new StaticWordValueEncoder("word"); TextValueEncoder tv = new TextValueEncoder("text"); InteractionValueEncoder enc = new InteractionValueEncoder("interactions", wv, tv); Vector v1 = new DenseVector(200); enc.addInteractionToVector("a","some text here",1.0, v1); int k = enc.getProbes(); // should interact "a" with each of "some","text" and "here" assertEquals((float) k*3, v1.norm(1), 0); }
@Test public void testAddToVectorUsesProductOfWeights() { WordValueEncoder wv = new StaticWordValueEncoder("word"); ContinuousValueEncoder cv = new ContinuousValueEncoder("cont"); InteractionValueEncoder enc = new InteractionValueEncoder("interactions", wv, cv); Vector v1 = new DenseVector(200); enc.addInteractionToVector("a","0.9",0.5, v1); int k = enc.getProbes(); // should set k distinct locations to 0.9*0.5 assertEquals((float) k*0.5*0.9, v1.norm(1), 0); assertEquals(0.5*0.9, v1.maxValue(), 0); }
@Test public void testAddToVector() { TextValueEncoder enc = new TextValueEncoder("text"); Vector v1 = new DenseVector(200); enc.addToVector("test1 and more", v1); enc.flush(1, v1); // should set 6 distinct locations to 1 assertEquals(6.0, v1.norm(1), 0); assertEquals(1.0, v1.maxValue(), 0); // now some fancy weighting StaticWordValueEncoder w = new StaticWordValueEncoder("text"); w.setDictionary(ImmutableMap.<String, Double>of("word1", 3.0, "word2", 1.5)); enc.setWordEncoder(w); // should set 6 locations to something Vector v2 = new DenseVector(200); enc.addToVector("test1 and more", v2); enc.flush(1, v2); // this should set the same 6 locations to the same values Vector v3 = new DenseVector(200); w.addToVector("test1", v3); w.addToVector("and", v3); w.addToVector("more", v3); assertEquals(0, v3.minus(v2).norm(1), 0); // moreover, the locations set in the unweighted case should be the same as in the weighted case assertEquals(v3.zSum(), v3.dot(v1), 0); }
@Test public void testAddToVector() { WordValueEncoder wv = new StaticWordValueEncoder("word"); ContinuousValueEncoder cv = new ContinuousValueEncoder("cont"); InteractionValueEncoder enc = new InteractionValueEncoder("interactions", wv, cv); Vector v1 = new DenseVector(200); enc.addInteractionToVector("a","1.0",1.0, v1); int k = enc.getProbes(); // should set k distinct locations to 1 assertEquals((float) k, v1.norm(1), 0); assertEquals(1.0, v1.maxValue(), 0); // adding same interaction again should increment weights enc.addInteractionToVector("a","1.0",1.0,v1); assertEquals((float) k*2, v1.norm(1), 0); assertEquals(2.0, v1.maxValue(), 0); Vector v2 = new DenseVector(20000); enc.addInteractionToVector("a","1.0",1.0,v2); wv.addToVector("a", v2); cv.addToVector("1.0", v2); k = enc.getProbes(); //this assumes no hash collision assertEquals((float) (k + wv.getProbes()+cv.getProbes()), v2.norm(1), 1.0e-3); }