/**
 * Encodes a four-token sample that carries no name spans and checks that
 * every token receives the {@code OTHER} outcome.
 */
@Test
public void testEncodeNoNames() {
  String[] tokens = "Once upon a time.".split(" ");
  NameSample sample = new NameSample(tokens, new Span[0], true);

  String[] encoded = codec.encode(sample.getNames(), sample.getSentence().length);

  // One OTHER per token: "Once", "upon", "a", "time."
  String[] allOther = {OTHER, OTHER, OTHER, OTHER};
  Assert.assertArrayEquals("Only 'Other' is expected.", allOther, encoded);
}
/**
 * Encodes a sample with one span covering a single token (index 2, "Julie")
 * and checks that only that token is tagged {@code A_START} while all other
 * tokens are tagged {@code OTHER}.
 */
@Test
public void testEncodeSingleTokenSpan() {
  String[] tokens = "I called Julie again.".split(" ");
  Span julie = new Span(2, 3, A_TYPE);
  NameSample sample = new NameSample(tokens, new Span[] {julie}, true);

  String[] encoded = codec.encode(sample.getNames(), sample.getSentence().length);

  String[] wanted = {OTHER, OTHER, A_START, OTHER};
  Assert.assertArrayEquals(
      "'Julie' should be 'start' only, the rest should be 'other'.", wanted, encoded);
}
/**
 * Encodes a sample with one span covering two tokens (indices 2 and 3,
 * "Stefanie Schmidt") and checks that the first token of the span is tagged
 * {@code A_START}, the second {@code A_CONTINUE}, and everything else
 * {@code OTHER}.
 */
@Test
public void testEncodeDoubleTokenSpan() {
  String[] tokens = "I saw Stefanie Schmidt today.".split(" ");
  Span fullName = new Span(2, 4, A_TYPE);
  NameSample sample = new NameSample(tokens, new Span[] {fullName}, true);

  String[] encoded = codec.encode(sample.getNames(), sample.getSentence().length);

  String[] wanted = {OTHER, OTHER, A_START, A_CONTINUE, OTHER};
  Assert.assertArrayEquals(
      "'Stefanie' should be 'start' only, 'Schmidt' is "
          + "'continue' and the rest should be 'other'.",
      wanted,
      encoded);
}
/**
 * Encodes a sample whose only span has a {@code null} type and checks that
 * the outcomes for the two covered tokens use the {@code "default"} prefix
 * joined by {@code "-"} with {@code BioCodec.START} and
 * {@code BioCodec.CONTINUE}; all other tokens are expected to be
 * {@code OTHER}.
 */
@Test
public void testEncodeDoubleTokenSpanNoType() {
  String[] tokens = "I saw Stefanie Schmidt today.".split(" ");
  Span untyped = new Span(2, 4, null);
  NameSample sample = new NameSample(tokens, new Span[] {untyped}, true);

  String[] encoded = codec.encode(sample.getNames(), sample.getSentence().length);

  // Outcome labels expected for a span without an explicit type.
  final String defaultStart = "default" + "-" + BioCodec.START;
  final String defaultContinue = "default" + "-" + BioCodec.CONTINUE;
  String[] wanted = {OTHER, OTHER, defaultStart, defaultContinue, OTHER};

  Assert.assertArrayEquals(
      "'Stefanie' should be 'start' only, 'Schmidt' is "
          + "'continue' and the rest should be 'other'.",
      wanted,
      encoded);
}
/**
 * Encodes a sample with two directly adjacent single-token spans of the same
 * type (indices 1 and 2) and checks that each of them is tagged
 * {@code A_START}, so the two names are not merged into one.
 */
@Test
public void testEncodeAdjacentSingleSpans() {
  String[] tokens = "something PersonA PersonB Something".split(" ");
  Span personA = new Span(1, 2, A_TYPE);
  Span personB = new Span(2, 3, A_TYPE);
  NameSample sample = new NameSample(tokens, new Span[] {personA, personB}, true);

  String[] encoded = codec.encode(sample.getNames(), sample.getSentence().length);

  String[] wanted = {OTHER, A_START, A_START, OTHER};
  Assert.assertArrayEquals(wanted, encoded);
}
/**
 * Encodes a sample with a two-token span (indices 1 and 2) immediately
 * followed by a single-token span (index 3) of the same type, and checks that
 * the second span begins with its own {@code A_START} instead of continuing
 * the first one.
 */
@Test
public void testEncodeAdjacentSpans() {
  String[] tokens = "something PersonA PersonA PersonB Something".split(" ");
  Span personA = new Span(1, 3, A_TYPE);
  Span personB = new Span(3, 4, A_TYPE);
  NameSample sample = new NameSample(tokens, new Span[] {personA, personB}, true);

  String[] encoded = codec.encode(sample.getNames(), sample.getSentence().length);

  String[] wanted = {OTHER, A_START, A_CONTINUE, A_START, OTHER};
  Assert.assertArrayEquals(wanted, encoded);
}
public synchronized static List<INDArray> mapToLabelVectors(NameSample sample, int windowSize, String[] labelStrings) { Map<String, Integer> labelToIndex = IntStream.range(0, labelStrings.length).boxed() .collect(Collectors.toMap(i -> labelStrings[i], i -> i)); List<INDArray> vectors = new ArrayList<INDArray>(); // encode the outcome as one-hot-representation String outcomes[] = new BioCodec().encode(sample.getNames(), sample.getSentence().length); for (int i = 0; i < sample.getSentence().length; i++) { INDArray labels = Nd4j.create(1, labelStrings.length, windowSize); labels.putScalar(new int[] { 0, labelToIndex.get(outcomes[i]), windowSize - 1 }, 1.0d); vectors.add(labels); } return vectors; }
static List<INDArray> mapToLabelVectors(NameSample sample, int windowSize, String[] labelStrings) { Map<String, Integer> labelToIndex = IntStream.range(0, labelStrings.length).boxed() .collect(Collectors.toMap(i -> labelStrings[i], i -> i)); List<INDArray> vectors = new ArrayList<INDArray>(); for (int i = 0; i < sample.getSentence().length; i++) { // encode the outcome as one-hot-representation String outcomes[] = new BioCodec().encode(sample.getNames(), sample.getSentence().length); INDArray labels = Nd4j.create(1, labelStrings.length, windowSize); labels.putScalar(new int[]{0, labelToIndex.get(outcomes[i]), windowSize - 1}, 1.0d); vectors.add(labels); } return vectors; }