@Override
protected void setUp() throws TexeraException {
    // Capture the upstream schema, then validate it: every attribute the
    // predicate reads must exist, and the result attribute must not collide
    // with an existing column.
    inputSchema = inputOperator.getOutputSchema();
    final Schema upstream = inputSchema;
    Schema.checkAttributeExists(upstream, predicate.getAttributeNames());
    Schema.checkAttributeNotExists(upstream, predicate.getResultAttribute());
    outputSchema = transformToOutputSchema(upstream);
}
/**
 * Creates a predicate configuring the NLP entity extraction operator.
 *
 * @param nlpEntityType   the entity category to extract
 * @param attributeNames  the tuple attributes to run extraction on; must be
 *                        non-null and non-empty
 * @param resultAttribute the attribute name for the extraction results; when
 *                        null or blank, the operator's generated ID is used
 * @throws TexeraException if {@code attributeNames} is null or empty
 */
@JsonCreator
public NlpEntityPredicate(
        @JsonProperty(value = PropertyNameConstants.NLP_ENTITY_TYPE, required = true)
        NlpEntityType nlpEntityType,
        @JsonProperty(value = PropertyNameConstants.ATTRIBUTE_NAMES, required = true)
        List<String> attributeNames,
        @JsonProperty(value = PropertyNameConstants.RESULT_ATTRIBUTE_NAME, required = true)
        String resultAttribute) {
    // Jackson's "required = true" only guarantees the JSON key is present;
    // an explicit null value still deserializes. Guard against it so the
    // caller gets a descriptive TexeraException instead of a raw NPE.
    if (attributeNames == null || attributeNames.isEmpty()) {
        throw new TexeraException("attributes should not be empty");
    }
    this.nlpEntityType = nlpEntityType;
    this.attributeNames = attributeNames;
    // Fall back to the auto-generated operator ID when the user supplied no
    // (or a blank) result attribute name.
    if (resultAttribute == null || resultAttribute.trim().isEmpty()) {
        this.resultAttribute = this.getID();
    } else {
        this.resultAttribute = resultAttribute;
    }
}
@Test
public void testNlpEntity() throws Exception {
    // Extract LOCATION entities and store them under "nlpEntityResults".
    NlpEntityPredicate locationPredicate =
            new NlpEntityPredicate(NlpEntityType.LOCATION, attributeNames, "nlpEntityResults");
    testPredicate(locationPredicate);
}
if (getNlpTypeIndicator(predicate.getNlpEntityType()).equals("POS")) { props.setProperty("annotators", "tokenize, ssplit, pos"); if (posPipeline == null) { if (getNlpTypeIndicator(predicate.getNlpEntityType()).equals("POS")) { stanfordNlpConstant = token.get(CoreAnnotations.PartOfSpeechAnnotation.class); } else { continue; if (predicate.getNlpEntityType().equals(NlpEntityType.NE_ALL) || predicate.getNlpEntityType().equals(nlpEntityType)) { int start = token.get(CoreAnnotations.CharacterOffsetBeginAnnotation.class); int end = token.get(CoreAnnotations.CharacterOffsetEndAnnotation.class); if (spanList.size() >= 1 && (getNlpTypeIndicator(predicate.getNlpEntityType()).equals("NE_ALL"))) { Span previousSpan = spanList.get(spanList.size() - 1); if (previousSpan.getAttributeName().equals(span.getAttributeName())
/** Assigns each shared test predicate its well-known constant operator ID. */
public static void setDefaultID() {
    keywordSourcePredicate.setID(KEYWORD_SOURCE_ID);
    regexPredicate.setID(REGEX_ID);
    fuzzyTokenPredicate.setID(FUZZY_TOKEN_ID);
    nlpEntityPredicate.setID(NLP_ENTITY_ID);
    joinDistancePredicate.setID(JOIN_DISTANCE_ID);
    tupleSinkPredicate.setID(TUPLE_SINK_ID);
}
/**
 * Runs the NLP entity operator over every tuple of the given table, counting
 * extracted spans, and folds elapsed seconds / span count into the shared
 * static accumulators (totalMatchingTime, totalResults).
 *
 * @param tableName the table to scan
 * @param tokenType the entity type to extract
 */
public static void matchNLP(String tableName, NlpEntityType tokenType) throws Exception {
    List<String> attributeNames = Arrays.asList(MedlineIndexWriter.ABSTRACT);
    ISourceOperator sourceOperator =
            new ScanBasedSourceOperator(new ScanSourcePredicate(tableName));
    NlpEntityOperator nlpEntityOperator = new NlpEntityOperator(
            new NlpEntityPredicate(tokenType, attributeNames, SchemaConstants.SPAN_LIST));
    nlpEntityOperator.setInputOperator(sourceOperator);

    long startMillis = System.currentTimeMillis();
    nlpEntityOperator.open();
    int spanCount = 0;
    Tuple tuple;
    while ((tuple = nlpEntityOperator.getNextTuple()) != null) {
        // Each result tuple carries the extracted spans in its SPAN_LIST field.
        ListField<Span> spanListField = tuple.getField(SchemaConstants.SPAN_LIST);
        spanCount += spanListField.getValue().size();
    }
    nlpEntityOperator.close();
    long endMillis = System.currentTimeMillis();

    totalMatchingTime += (endMillis - startMillis) / 1000.0;
    totalResults += spanCount;
}
/**
 * Builds this operator's output schema: the single input schema plus a new
 * LIST-typed result attribute named by the predicate.
 *
 * @param inputSchema exactly one input schema
 * @return the input schema extended with the result attribute
 * @throws TexeraException if the argument count is not 1, a predicate
 *         attribute is missing, or the result attribute already exists
 */
public Schema transformToOutputSchema(Schema... inputSchema) {
    if (inputSchema.length != 1) {
        throw new TexeraException(String.format(
                ErrorMessages.NUMBER_OF_ARGUMENTS_DOES_NOT_MATCH, 1, inputSchema.length));
    }
    Schema sourceSchema = inputSchema[0];
    Schema.checkAttributeExists(sourceSchema, predicate.getAttributeNames());
    Schema.checkAttributeNotExists(sourceSchema, predicate.getResultAttribute());
    return new Schema.Builder()
            .add(sourceSchema)
            .add(predicate.getResultAttribute(), AttributeType.LIST)
            .build();
}
}
/**
 * Scans the given table, runs NLP entity extraction over the listed
 * attributes, and collects every produced tuple.
 *
 * @param tableName      the table to scan
 * @param attributeNames the attributes to extract entities from
 * @param nlpEntityType  the entity type to extract
 * @param limit          maximum number of result tuples
 * @param offset         number of leading result tuples to skip
 * @return all tuples emitted by the operator, in emission order
 */
public List<Tuple> getQueryResults(String tableName, List<String> attributeNames,
        NlpEntityType nlpEntityType, int limit, int offset) throws Exception {
    ScanBasedSourceOperator scanSource =
            new ScanBasedSourceOperator(new ScanSourcePredicate(tableName));
    NlpEntityOperator nlpEntityOperator = new NlpEntityOperator(
            new NlpEntityPredicate(nlpEntityType, attributeNames, RESULTS));
    nlpEntityOperator.setInputOperator(scanSource);
    nlpEntityOperator.setLimit(limit);
    nlpEntityOperator.setOffset(offset);

    List<Tuple> results = new ArrayList<>();
    nlpEntityOperator.open();
    for (Tuple tuple = nlpEntityOperator.getNextTuple();
            tuple != null;
            tuple = nlpEntityOperator.getNextTuple()) {
        results.add(tuple);
    }
    nlpEntityOperator.close();
    return results;
}
/**
 * Extracts NLP spans from every predicate attribute of the input tuple.
 *
 * @param inputTuple the tuple to process
 * @return the input tuple extended with a LIST result attribute holding all
 *         extracted spans, or null when no span was found in any attribute
 */
@Override
public Tuple processOneInputTuple(Tuple inputTuple) throws TexeraException {
    List<Span> extractedSpans = new ArrayList<>();
    for (String attributeName : predicate.getAttributeNames()) {
        IField field = inputTuple.getField(attributeName);
        extractedSpans.addAll(extractNlpSpans(field, attributeName));
    }
    // Nothing extracted from any attribute: emit no output for this tuple.
    if (extractedSpans.isEmpty()) {
        return null;
    }
    return new Tuple.Builder(inputTuple)
            .add(predicate.getResultAttribute(), AttributeType.LIST,
                    new ListField<Span>(extractedSpans))
            .build();
}