/**
 * Maps an NLP entity type to the indicator string of the category it belongs to.
 *
 * @param nlpEntityType the entity type to classify
 * @return {@code "POS"} when {@code isPOSTokenType} accepts the type, otherwise {@code "NE_ALL"}
 */
private static String getNlpTypeIndicator(NlpEntityType nlpEntityType) {
    return isPOSTokenType(nlpEntityType) ? "POS" : "NE_ALL";
}
/**
 * Creates a new {@link NlpEntityOperator} constructed from this instance.
 *
 * @return a freshly constructed operator; a new object is returned on every call
 */
@Override public NlpEntityOperator newOperator() { return new NlpEntityOperator(this); }
/**
 * Runs an {@link NlpEntityOperator} on top of a scan of the given table and collects every
 * tuple the operator emits.
 *
 * @param tableName      name of the table to scan
 * @param attributeNames attributes handed to the NLP entity predicate
 * @param nlpEntityType  entity type handed to the NLP entity predicate
 * @param limit          value passed to {@code setLimit} on the operator
 * @param offset         value passed to {@code setOffset} on the operator
 * @return the tuples returned by the operator, in the order they were produced
 * @throws Exception if building, opening, reading from or closing the operator fails
 */
public List<Tuple> getQueryResults(String tableName, List<String> attributeNames, NlpEntityType nlpEntityType,
        int limit, int offset) throws Exception {
    ScanBasedSourceOperator scanSource = new ScanBasedSourceOperator(new ScanSourcePredicate(tableName));
    NlpEntityPredicate nlpEntityPredicate = new NlpEntityPredicate(nlpEntityType, attributeNames, RESULTS);
    NlpEntityOperator nlpEntityOperator = new NlpEntityOperator(nlpEntityPredicate);
    nlpEntityOperator.setInputOperator(scanSource);
    nlpEntityOperator.setLimit(limit);
    nlpEntityOperator.setOffset(offset);

    List<Tuple> results = new ArrayList<>();
    nlpEntityOperator.open();
    try {
        Tuple nextTuple;
        while ((nextTuple = nlpEntityOperator.getNextTuple()) != null) {
            results.add(nextTuple);
        }
    } finally {
        // Close the operator even when reading a tuple throws, so it is never left open.
        nlpEntityOperator.close();
    }
    return results;
}
/**
 * Runs an {@link NlpEntityOperator} over the {@code MedlineIndexWriter.ABSTRACT} attribute of
 * the given table, counts the spans it produces and adds the elapsed time and span count to
 * the {@code totalMatchingTime} and {@code totalResults} accumulators.
 *
 * The timed interval starts before the operator is opened and ends after it is closed.
 *
 * @param tableName name of the table to scan
 * @param tokenType entity type handed to the NLP entity predicate
 * @throws Exception if building, opening, reading from or closing the operator fails;
 *                   the accumulators are left untouched in that case
 */
public static void matchNLP(String tableName, NlpEntityType tokenType) throws Exception {
    List<String> attributeNames = Arrays.asList(MedlineIndexWriter.ABSTRACT);

    ISourceOperator sourceOperator = new ScanBasedSourceOperator(new ScanSourcePredicate(tableName));
    NlpEntityPredicate nlpEntityPredicate =
            new NlpEntityPredicate(tokenType, attributeNames, SchemaConstants.SPAN_LIST);
    NlpEntityOperator nlpEntityOperator = new NlpEntityOperator(nlpEntityPredicate);
    nlpEntityOperator.setInputOperator(sourceOperator);

    long startMatchTime = System.currentTimeMillis();
    int counter = 0;
    nlpEntityOperator.open();
    try {
        Tuple nextTuple;
        while ((nextTuple = nlpEntityOperator.getNextTuple()) != null) {
            ListField<Span> spanListField = nextTuple.getField(SchemaConstants.SPAN_LIST);
            List<Span> spanList = spanListField.getValue();
            counter += spanList.size();
        }
    } finally {
        // Close the operator even when a tuple cannot be read, so it is never left open.
        nlpEntityOperator.close();
    }
    long endMatchTime = System.currentTimeMillis();

    // Milliseconds to seconds.
    double matchTime = (endMatchTime - startMatchTime) / 1000.0;
    totalMatchingTime += matchTime;
    totalResults += counter;
}
/**
 * Writes the one-sentence test tuples into {@code ONE_SENTENCE_TABLE}, queries
 * {@code SENTENCE_ONE} for {@code NE_ALL} entities with limit 2 and offset 2, and checks that
 * exactly two tuples come back and that each of them is among the test-10 expected results.
 *
 * NOTE(review): no {@code @Test} annotation is visible on this method here; confirm whether
 * it is meant to run as part of the suite.
 *
 * @throws Exception if writing the test data or running the query fails
 */
public void getNextTupleTestWithLimitOffset() throws Exception {
    List<Tuple> data = NlpEntityTestConstants.getOneSentenceTestTuple();
    DataWriter oneSentenceDataWriter = RelationManager.getInstance().getTableDataWriter(ONE_SENTENCE_TABLE);
    oneSentenceDataWriter.open();
    try {
        for (Tuple tuple : data) {
            oneSentenceDataWriter.insertTuple(tuple);
        }
    } finally {
        // Close the writer even when an insert fails, so it is not left open.
        oneSentenceDataWriter.close();
    }

    String attribute1 = NlpEntityTestConstants.SENTENCE_ONE;
    List<String> attributeNames = Arrays.asList(attribute1);

    List<Tuple> returnedResults = getQueryResults(ONE_SENTENCE_TABLE, attributeNames, NlpEntityType.NE_ALL, 2, 2);
    List<Tuple> expectedResults = NlpEntityTestConstants.getTest10ResultTuples();

    // JUnit expects (expected, actual); the swapped order produced a misleading failure message.
    Assert.assertEquals(2, returnedResults.size());
    Assert.assertTrue(TestUtils.containsAll(expectedResults, returnedResults));
}
/**
 * Prepares the operator's schemas: reads the input operator's output schema, validates it
 * against the predicate, and derives the output schema from it.
 *
 * @throws TexeraException declared by the signature; presumably raised by the schema checks
 *                         when validation fails (their implementation is not visible here)
 */
@Override
protected void setUp() throws TexeraException {
    inputSchema = inputOperator.getOutputSchema();

    // The predicate's attributes are checked for presence and its result attribute for
    // absence before the output schema is derived.
    Schema.checkAttributeExists(inputSchema, predicate.getAttributeNames());
    Schema.checkAttributeNotExists(inputSchema, predicate.getResultAttribute());

    outputSchema = transformToOutputSchema(inputSchema);
}
/**
 * Extracts NLP spans from every attribute named by the predicate and, when at least one span
 * was found, returns a copy of the input tuple extended with the span list.
 *
 * @param inputTuple the tuple to process
 * @return the input tuple plus a LIST field stored under the predicate's result attribute,
 *         or {@code null} when no span was extracted from any attribute
 * @throws TexeraException declared by the signature; presumably propagated from span extraction
 */
@Override
public Tuple processOneInputTuple(Tuple inputTuple) throws TexeraException {
    List<Span> collectedSpans = new ArrayList<>();
    for (String attributeName : predicate.getAttributeNames()) {
        IField attributeValue = inputTuple.getField(attributeName);
        collectedSpans.addAll(extractNlpSpans(attributeValue, attributeName));
    }

    if (!collectedSpans.isEmpty()) {
        return new Tuple.Builder(inputTuple)
                .add(predicate.getResultAttribute(), AttributeType.LIST, new ListField<Span>(collectedSpans))
                .build();
    }
    // Nothing matched in this tuple.
    return null;
}
/**
 * Writes the test-8 tuples into {@code ONE_SENTENCE_TABLE}, queries {@code SENTENCE_ONE} for
 * {@code MONEY} entities and expects the results to equal the test-8 expected tuples.
 *
 * @throws Exception if writing the test data or running the query fails
 */
@Test
public void getNextTupleTest8() throws Exception {
    List<Tuple> data = NlpEntityTestConstants.getTest8Tuple();
    DataWriter oneSentenceDataWriter = RelationManager.getInstance().getTableDataWriter(ONE_SENTENCE_TABLE);
    oneSentenceDataWriter.open();
    try {
        for (Tuple tuple : data) {
            oneSentenceDataWriter.insertTuple(tuple);
        }
    } finally {
        // Close the writer even when an insert fails, so it is not left open.
        oneSentenceDataWriter.close();
    }

    String attribute1 = NlpEntityTestConstants.SENTENCE_ONE;
    List<String> attributeNames = new ArrayList<>();
    attributeNames.add(attribute1);

    List<Tuple> returnedResults = getQueryResults(ONE_SENTENCE_TABLE, attributeNames, NlpEntityType.MONEY);
    List<Tuple> expectedResults = NlpEntityTestConstants.getTest8ResultTuples();

    Assert.assertTrue(TestUtils.equals(expectedResults, returnedResults));
}
/**
 * Writes the two-sentence test tuples into {@code TWO_SENTENCE_TABLE}, queries both
 * {@code SENTENCE_ONE} and {@code SENTENCE_TWO} for {@code NE_ALL} entities and expects the
 * results to equal the test-11 expected tuples.
 *
 * @throws Exception if writing the test data or running the query fails
 */
@Test
public void getNextTupleTest11() throws Exception {
    List<Tuple> data = NlpEntityTestConstants.getTwoSentenceTestTuple();
    DataWriter twoSentenceDataWriter = RelationManager.getInstance().getTableDataWriter(TWO_SENTENCE_TABLE);
    twoSentenceDataWriter.open();
    try {
        for (Tuple tuple : data) {
            twoSentenceDataWriter.insertTuple(tuple);
        }
    } finally {
        // Close the writer even when an insert fails, so it is not left open.
        twoSentenceDataWriter.close();
    }

    String attribute1 = NlpEntityTestConstants.SENTENCE_ONE;
    String attribute2 = NlpEntityTestConstants.SENTENCE_TWO;
    List<String> attributeNames = Arrays.asList(attribute1, attribute2);

    List<Tuple> returnedResults = getQueryResults(TWO_SENTENCE_TABLE, attributeNames, NlpEntityType.NE_ALL);
    List<Tuple> expectedResults = NlpEntityTestConstants.getTest11ResultTuple();

    Assert.assertTrue(TestUtils.equals(expectedResults, returnedResults));
}
/**
 * Writes the test-9 tuples into {@code TWO_SENTENCE_TABLE}, queries both
 * {@code SENTENCE_ONE} and {@code SENTENCE_TWO} for {@code NE_ALL} entities and expects the
 * results to equal the test-9 expected tuples.
 *
 * @throws Exception if writing the test data or running the query fails
 */
@Test
public void getNextTupleTest9() throws Exception {
    List<Tuple> data = NlpEntityTestConstants.getTest9Tuple();
    DataWriter twoSentenceDataWriter = RelationManager.getInstance().getTableDataWriter(TWO_SENTENCE_TABLE);
    twoSentenceDataWriter.open();
    try {
        for (Tuple tuple : data) {
            twoSentenceDataWriter.insertTuple(tuple);
        }
    } finally {
        // Close the writer even when an insert fails, so it is not left open.
        twoSentenceDataWriter.close();
    }

    String attribute1 = NlpEntityTestConstants.SENTENCE_ONE;
    String attribute2 = NlpEntityTestConstants.SENTENCE_TWO;
    List<String> attributeNames = new ArrayList<>();
    attributeNames.add(attribute1);
    attributeNames.add(attribute2);

    List<Tuple> returnedResults = getQueryResults(TWO_SENTENCE_TABLE, attributeNames, NlpEntityType.NE_ALL);
    List<Tuple> expectedResults = NlpEntityTestConstants.getTest9ResultTuples();

    Assert.assertTrue(TestUtils.equals(expectedResults, returnedResults));
}
/**
 * Scenario 1: test getNextTuple with only one span in the returned list.
 * <p>
 * Text: Microsoft is a organization.
 * <p>
 * Searches the first sentence attribute for all NE_ALL entity types.
 *
 * @throws Exception if writing the test data or running the query fails
 */
@Test
public void getNextTupleTest1() throws Exception {
    List<Tuple> data = NlpEntityTestConstants.getTest1Tuple();
    DataWriter oneSentenceDataWriter = RelationManager.getInstance().getTableDataWriter(ONE_SENTENCE_TABLE);
    oneSentenceDataWriter.open();
    try {
        for (Tuple tuple : data) {
            oneSentenceDataWriter.insertTuple(tuple);
        }
    } finally {
        // Close the writer even when an insert fails, so it is not left open.
        oneSentenceDataWriter.close();
    }

    String attribute1 = NlpEntityTestConstants.SENTENCE_ONE;
    List<String> attributeNames = new ArrayList<>();
    attributeNames.add(attribute1);

    List<Tuple> returnedResults = getQueryResults(ONE_SENTENCE_TABLE, attributeNames, NlpEntityType.NE_ALL);
    List<Tuple> expectedResults = NlpEntityTestConstants.getTest1ResultTuples();

    Assert.assertTrue(TestUtils.equals(expectedResults, returnedResults));
}
/**
 * Scenario 5: test getNextTuple using two fields.
 * <p>
 * Sentence1: Microsoft, Google and Facebook are organizations.
 * Sentence2: Donald Trump and Barack Obama are persons.
 * <p>
 * Only the second field is searched, for all NE_ALL entity types. The input data is the
 * test-4 tuple set; the expected output is the test-5 result set.
 *
 * @throws Exception if writing the test data or running the query fails
 */
@Test
public void getNextTupleTest5() throws Exception {
    List<Tuple> data = NlpEntityTestConstants.getTest4Tuple();
    DataWriter twoSentenceDataWriter = RelationManager.getInstance().getTableDataWriter(TWO_SENTENCE_TABLE);
    twoSentenceDataWriter.open();
    try {
        for (Tuple tuple : data) {
            twoSentenceDataWriter.insertTuple(tuple);
        }
    } finally {
        // Close the writer even when an insert fails, so it is not left open.
        twoSentenceDataWriter.close();
    }

    String attribute = NlpEntityTestConstants.SENTENCE_TWO;
    List<String> attributeNames = new ArrayList<>();
    attributeNames.add(attribute);

    List<Tuple> returnedResults = getQueryResults(TWO_SENTENCE_TABLE, attributeNames, NlpEntityType.NE_ALL);
    List<Tuple> expectedResults = NlpEntityTestConstants.getTest5ResultTuples();

    Assert.assertTrue(TestUtils.equals(expectedResults, returnedResults));
}
/**
 * Derives the output schema from exactly one input schema by appending the predicate's
 * result attribute as a LIST attribute.
 *
 * @param inputSchema the input schemas; exactly one must be supplied
 * @return a schema holding everything in the input schema plus the result attribute
 * @throws TexeraException if the number of supplied schemas is not 1
 */
public Schema transformToOutputSchema(Schema... inputSchema) {
    if (inputSchema.length != 1) {
        throw new TexeraException(
                String.format(ErrorMessages.NUMBER_OF_ARGUMENTS_DOES_NOT_MATCH, 1, inputSchema.length));
    }

    Schema soleInputSchema = inputSchema[0];
    // Same validation order as before: required attributes first, then the result attribute.
    Schema.checkAttributeExists(soleInputSchema, predicate.getAttributeNames());
    Schema.checkAttributeNotExists(soleInputSchema, predicate.getResultAttribute());

    return new Schema.Builder()
            .add(soleInputSchema)
            .add(predicate.getResultAttribute(), AttributeType.LIST)
            .build();
}
}
/**
 * Passes an {@link NlpEntityPredicate} for {@code LOCATION} entities, built over
 * {@code attributeNames} with result attribute {@code "nlpEntityResults"}, to
 * {@code testPredicate}.
 *
 * @throws Exception if {@code testPredicate} fails
 */
@Test
public void testNlpEntity() throws Exception {
    testPredicate(new NlpEntityPredicate(NlpEntityType.LOCATION, attributeNames, "nlpEntityResults"));
}
/**
 * Uses the NLP entity operator to fetch all results from the given table, returned as a list
 * of tuples. Delegates to the five-argument overload with a limit of
 * {@code Integer.MAX_VALUE} and an offset of 0.
 *
 * @param tableName      name of the table to query
 * @param attributeNames attributes to run entity extraction on
 * @param nlpEntityType  entity type to extract
 * @return all tuples returned by the operator
 * @throws Exception if the underlying query fails
 */
public List<Tuple> getQueryResults(String tableName, List<String> attributeNames, NlpEntityType nlpEntityType)
        throws Exception {
    final int noLimit = Integer.MAX_VALUE;
    final int noOffset = 0;
    return getQueryResults(tableName, attributeNames, nlpEntityType, noLimit, noOffset);
}
/**
 * Pulls tuples from the input operator until one of them yields a non-null result from
 * {@code processOneInputTuple}, and returns that result.
 *
 * @return the first non-null processed tuple, or {@code null} once the input operator
 *         returns {@code null}
 * @throws TexeraException declared by the signature; presumably propagated from the input
 *                         operator or from tuple processing
 */
@Override
protected Tuple computeNextMatchingTuple() throws TexeraException {
    Tuple candidate;
    while ((candidate = inputOperator.getNextTuple()) != null) {
        Tuple processed = processOneInputTuple(candidate);
        if (processed != null) {
            return processed;
        }
    }
    // Input exhausted without any further match.
    return null;
}
/**
 * Assigns an ID constant to each of the six predicates referenced here: keyword source,
 * regex, fuzzy token, NLP entity, join distance and tuple sink. Each predicate receives the
 * correspondingly named {@code *_ID} constant via {@code setID}.
 */
public static void setDefaultID() { keywordSourcePredicate.setID(KEYWORD_SOURCE_ID); regexPredicate.setID(REGEX_ID); fuzzyTokenPredicate.setID(FUZZY_TOKEN_ID); nlpEntityPredicate.setID(NLP_ENTITY_ID); joinDistancePredicate.setID(JOIN_DISTANCE_ID); tupleSinkPredicate.setID(TUPLE_SINK_ID); }
public void getNextTupleTestWithLimit() throws Exception { List<Tuple> data = NlpEntityTestConstants.getOneSentenceTestTuple(); DataWriter oneSentenceDataWriter = RelationManager.getInstance().getTableDataWriter(ONE_SENTENCE_TABLE); oneSentenceDataWriter.open(); for (Tuple tuple : data) { oneSentenceDataWriter.insertTuple(tuple); } oneSentenceDataWriter.close(); String attribute1 = NlpEntityTestConstants.SENTENCE_ONE; List<String> attributeNames = Arrays.asList(attribute1); List<Tuple> returnedResults = getQueryResults(ONE_SENTENCE_TABLE, attributeNames, NlpEntityType.NE_ALL, 3, 0); List<Tuple> expectedResults = NlpEntityTestConstants.getTest10ResultTuples(); // ExpectedResults is the array containing all the matches. // Since the order of returning records in returnedResults is not deterministic, we use containsAll // to ensure that the records in returnedResults are included in the ExpectedResults. Assert.assertEquals(returnedResults.size(), 3); Assert.assertTrue(TestUtils.containsAll(expectedResults, returnedResults)); }
/**
 * Writes the one-sentence test tuples into {@code ONE_SENTENCE_TABLE}, queries
 * {@code SENTENCE_ONE} for {@code NE_ALL} entities and expects the results to equal the
 * test-10 expected tuples.
 *
 * @throws Exception if writing the test data or running the query fails
 */
@Test
public void getNextTupleTest10() throws Exception {
    List<Tuple> data = NlpEntityTestConstants.getOneSentenceTestTuple();
    DataWriter oneSentenceDataWriter = RelationManager.getInstance().getTableDataWriter(ONE_SENTENCE_TABLE);
    oneSentenceDataWriter.open();
    try {
        for (Tuple tuple : data) {
            oneSentenceDataWriter.insertTuple(tuple);
        }
    } finally {
        // Close the writer even when an insert fails, so it is not left open.
        oneSentenceDataWriter.close();
    }

    String attribute1 = NlpEntityTestConstants.SENTENCE_ONE;
    List<String> attributeNames = Arrays.asList(attribute1);

    List<Tuple> returnedResults = getQueryResults(ONE_SENTENCE_TABLE, attributeNames, NlpEntityType.NE_ALL);
    List<Tuple> expectedResults = NlpEntityTestConstants.getTest10ResultTuples();

    Assert.assertTrue(TestUtils.equals(expectedResults, returnedResults));
}