@Override
public void add(CharsRef input, CharsRef output, boolean includeOrig) {
  // This condition follows up on the overridden analyze method. If lenient was set to true and
  // super.analyze threw an exception, a zero-length CharsRef was returned for the offending word.
  // When the synonym mappings are added here, we skip any entry that analyze left empty; i.e.,
  // with lenient set we only add combinations where both sides are non-zero-length. The else
  // branch is reached only when the input or output is empty and lenient is set, in which case
  // the mapping is quietly ignored. For more details on the control flow see
  // SolrSynonymParser::addInternal.
  if (lenient == false || (input.length > 0 && output.length > 0)) {
    super.add(input, output, includeOrig);
  }
}
@Override
public TokenStream create(TokenStream input) {
  // If the fst is null there are actually no synonyms, so just return the
  // original stream: there is nothing to do here.
  return map.fst == null ? input : new SynonymFilter(input, map, ignoreCase);
}
@Override
public TokenStream create(TokenStream input) {
  // If the fst is null there are actually no synonyms, so just return the
  // original stream: there is nothing to do here.
  return map.fst == null ? input : new SynonymGraphFilter(input, map, ignoreCase);
}
/**
 * Add a phrase->phrase synonym mapping.
 * Phrases are character sequences where words are
 * separated with character zero (U+0000). Empty words
 * (two U+0000s in a row) are not allowed in the input nor
 * the output!
 *
 * @param input input phrase
 * @param output output phrase
 * @param includeOrig true if the original should be included
 */
public void add(CharsRef input, CharsRef output, boolean includeOrig) {
  add(input, countWords(input), output, countWords(output), includeOrig);
}
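// For context: callers normally do not write the U+0000 separators by hand; the
// static helper SynonymMap.Builder.join encodes a word array into that form. Below
// is a minimal sketch of building a map with one phrase rule. The "wi fi" -> "wifi"
// rule and the class name PhraseSynonymExample are invented for illustration;
// join/add/build are the Lucene Builder API.
import java.io.IOException;

import org.apache.lucene.analysis.synonym.SynonymMap;
import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.CharsRefBuilder;

public class PhraseSynonymExample {
  public static SynonymMap build() throws IOException {
    SynonymMap.Builder builder = new SynonymMap.Builder(true); // dedup duplicate rules
    // join(...) inserts the U+0000 word separator between "wi" and "fi":
    CharsRef input = SynonymMap.Builder.join(new String[] {"wi", "fi"}, new CharsRefBuilder());
    CharsRef output = SynonymMap.Builder.join(new String[] {"wifi"}, new CharsRefBuilder());
    builder.add(input, output, true); // true: keep the original tokens as well
    return builder.build();
  }
}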
protected SynonymMap buildSynonyms(Analyzer analyzer, Reader rules) {
  try {
    SynonymMap.Parser parser;
    if ("wordnet".equalsIgnoreCase(format)) {
      parser = new ESWordnetSynonymParser(true, expand, lenient, analyzer);
    } else {
      parser = new ESSolrSynonymParser(true, expand, lenient, analyzer);
    }
    parser.parse(rules);
    return parser.build();
  } catch (Exception e) {
    throw new IllegalArgumentException("failed to build synonyms", e);
  }
}
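// As a usage illustration, the Solr-format branch above amounts to the following
// sketch. The rule text and the WhitespaceAnalyzer choice are invented for
// illustration; only the constructor and parse/build calls visible above are assumed.
import java.io.Reader;
import java.io.StringReader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import org.apache.lucene.analysis.synonym.SynonymMap;

public class SolrRulesExample {
  public static SynonymMap parseRules() throws Exception {
    Analyzer analyzer = new WhitespaceAnalyzer();
    // Two Solr-format rules: an equivalence set and an explicit mapping.
    Reader rules = new StringReader("i-pod, ipod\nsea biscuit => seabiscuit");
    // dedup = true, expand = true, lenient = false, as in the factory above
    ESSolrSynonymParser parser = new ESSolrSynonymParser(true, true, false, analyzer);
    parser.parse(rules);
    return parser.build();
  }
}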
@Override
public void reset() throws IOException {
  super.reset();
  captureCount = 0;
  finished = false;
  inputSkipCount = 0;
  nextRead = nextWrite = 0;

  // In normal usage these resets would not be needed,
  // since they reset-as-they-are-consumed, but the app
  // may not consume all input tokens (or we might hit an
  // exception), in which case we have leftover state
  // here:
  for (PendingInput input : futureInputs) {
    input.reset();
  }
  for (PendingOutputs output : futureOutputs) {
    output.reset();
  }
}
private CharsRef parseSynonym(String line, CharsRefBuilder reuse) throws IOException {
  if (reuse == null) {
    reuse = new CharsRefBuilder();
  }

  // The synonym text sits between single quotes in the WordNet prolog line;
  // a doubled quote ('') is the escape for a literal quote.
  int start = line.indexOf('\'') + 1;
  int end = line.lastIndexOf('\'');

  String text = line.substring(start, end).replace("''", "'");
  return analyze(text, reuse);
}
/**
 * @param input input tokenstream
 * @param synonyms synonym map
 * @param ignoreCase case-folds input for matching with {@link Character#toLowerCase(int)}.
 *                   Note, if you set this to true, it's your responsibility to lowercase
 *                   the input entries when you create the {@link SynonymMap}
 */
public SynonymFilter(TokenStream input, SynonymMap synonyms, boolean ignoreCase) {
  super(input);
  this.synonyms = synonyms;
  this.ignoreCase = ignoreCase;
  this.fst = synonyms.fst;
  if (fst == null) {
    throw new IllegalArgumentException("fst must be non-null");
  }
  this.fstReader = fst.getBytesReader();

  // Must be 1+ so that when roll buffer is at full
  // lookahead we can distinguish this full buffer from
  // the empty buffer:
  rollBufferSize = 1 + synonyms.maxHorizontalContext;

  futureInputs = new PendingInput[rollBufferSize];
  futureOutputs = new PendingOutputs[rollBufferSize];
  for (int pos = 0; pos < rollBufferSize; pos++) {
    futureInputs[pos] = new PendingInput();
    futureOutputs[pos] = new PendingOutputs();
  }

  //System.out.println("FSTFilt maxH=" + synonyms.maxHorizontalContext);

  scratchArc = new FST.Arc<>();
}
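// As a usage note, the ignoreCase contract in the javadoc above matters in
// practice: with ignoreCase = true, the SynonymMap entries must already be
// lowercased. Below is a minimal sketch wiring the filter into an analysis
// chain; the class name SynonymAnalyzer and the whitespace tokenizer choice
// are assumptions for illustration, and it assumes a Lucene line where
// createComponents takes only the field name.
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.synonym.SynonymFilter;
import org.apache.lucene.analysis.synonym.SynonymMap;

public class SynonymAnalyzer extends Analyzer {
  private final SynonymMap map;

  public SynonymAnalyzer(SynonymMap map) {
    this.map = map;
  }

  @Override
  protected TokenStreamComponents createComponents(String fieldName) {
    Tokenizer source = new WhitespaceTokenizer();
    // ignoreCase = true, so the map entries must have been lowercased:
    TokenStream result = new SynonymFilter(source, map, true);
    return new TokenStreamComponents(source, result);
  }
}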
private void capture() {
  captureCount++;
  //System.out.println("  capture slot=" + nextWrite);
  final PendingInput input = futureInputs[nextWrite];

  input.state = captureState();
  input.consumed = false;
  input.term.copyChars(termAtt.buffer(), 0, termAtt.length());

  nextWrite = rollIncr(nextWrite);

  // Buffer head should never catch up to tail:
  assert nextWrite != nextRead;
}
@Override
public CharsRef analyze(String text, CharsRefBuilder reuse) throws IOException {
  try {
    return super.analyze(text, reuse);
  } catch (IllegalArgumentException ex) {
    if (lenient) {
      logger.info("Synonym rule for [" + text + "] was ignored");
      return new CharsRef("");
    } else {
      throw ex;
    }
  }
}
/**
 * Load synonyms with the given {@link SynonymMap.Parser} class.
 */
protected SynonymMap loadSynonyms(ResourceLoader loader, String cname, boolean dedup, Analyzer analyzer)
    throws IOException, ParseException {
  CharsetDecoder decoder = StandardCharsets.UTF_8.newDecoder()
      .onMalformedInput(CodingErrorAction.REPORT)
      .onUnmappableCharacter(CodingErrorAction.REPORT);

  SynonymMap.Parser parser;
  Class<? extends SynonymMap.Parser> clazz = loader.findClass(cname, SynonymMap.Parser.class);
  try {
    parser = clazz.getConstructor(boolean.class, boolean.class, Analyzer.class)
        .newInstance(dedup, expand, analyzer);
  } catch (Exception e) {
    throw new RuntimeException(e);
  }

  List<String> files = splitFileNames(synonyms);
  for (String file : files) {
    decoder.reset();
    parser.parse(new InputStreamReader(loader.openResource(file), decoder));
  }
  return parser.build();
}
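// For illustration, any parser class passed as cname must expose the
// (boolean, boolean, Analyzer) constructor that the reflective lookup above
// expects. A hypothetical skeleton follows: CsvSynonymParser and its one-rule-
// per-line "input,output" format are invented; only the SynonymMap.Parser base
// constructor and the analyze/add contract are assumed.
import java.io.BufferedReader;
import java.io.IOException;
import java.io.Reader;
import java.text.ParseException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.synonym.SynonymMap;
import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.CharsRefBuilder;

public class CsvSynonymParser extends SynonymMap.Parser {

  public CsvSynonymParser(boolean dedup, boolean expand, Analyzer analyzer) {
    super(dedup, analyzer); // expand is unused in this toy format
  }

  @Override
  public void parse(Reader in) throws IOException, ParseException {
    BufferedReader br = new BufferedReader(in);
    String line;
    while ((line = br.readLine()) != null) {
      String[] sides = line.split(",", 2);
      if (sides.length != 2) {
        throw new ParseException("expected 'input,output': " + line, 0);
      }
      // analyze(...) runs each side through the analyzer and U+0000-joins the tokens:
      CharsRef input = analyze(sides[0].trim(), new CharsRefBuilder());
      CharsRef output = analyze(sides[1].trim(), new CharsRefBuilder());
      add(input, output, true);
    }
  }
}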
public CharsRef pullNext() {
  assert upto < count;
  lastEndOffset = endOffsets[upto];
  lastPosLength = posLengths[upto];
  final CharsRefBuilder result = outputs[upto++];
  // After the first output is pulled, the remaining ones stack onto the same position:
  posIncr = 0;
  if (upto == count) {
    reset();
  }
  return result.get();
}
@Override
public void parse(Reader in) throws IOException, ParseException {
  LineNumberReader br = new LineNumberReader(in);
  try {
    addInternal(br);
  } catch (IllegalArgumentException e) {
    ParseException ex = new ParseException("Invalid synonym rule at line " + br.getLineNumber(), 0);
    ex.initCause(e);
    throw ex;
  } finally {
    br.close();
  }
}
private final RollingBuffer<BufferedInputToken> lookahead =
    new RollingBuffer<BufferedInputToken>() {
      @Override
      protected BufferedInputToken newInstance() {
        return new BufferedInputToken();
      }
    };
/** Buffers the current input token into the lookahead buffer. */
private void capture() {
  assert liveToken;
  liveToken = false;
  BufferedInputToken token = lookahead.get(lookaheadNextWrite);
  lookaheadNextWrite++;

  token.state = captureState();
  token.startOffset = offsetAtt.startOffset();
  token.endOffset = offsetAtt.endOffset();
  assert token.term.length() == 0;
  token.term.append(termAtt);

  captureCount++;
  maxLookaheadUsed = Math.max(maxLookaheadUsed, lookahead.getBufferSize());
  //System.out.println("  maxLookaheadUsed=" + maxLookaheadUsed);
}
@Override
public TokenStream create(TokenStream tokenStream) {
  // A null fst means no synonym rules were loaded; pass the stream through unchanged.
  return synonyms.fst == null ? tokenStream : new SynonymFilter(tokenStream, synonyms, false);
}
private void addInternal(CharsRef[] synset, int size) {
  if (size <= 1) {
    return; // nothing to do
  }

  if (expand) {
    // Expand: map every member of the synset to every member (including itself).
    for (int i = 0; i < size; i++) {
      for (int j = 0; j < size; j++) {
        add(synset[i], synset[j], false);
      }
    }
  } else {
    // Contract: map every member of the synset to the first entry only.
    for (int i = 0; i < size; i++) {
      add(synset[i], synset[0], false);
    }
  }
}
@Override
public TokenStream create(TokenStream tokenStream) {
  // A null fst means there are no synonyms; pass the stream through unchanged.
  return synonyms.fst == null ? tokenStream : new SynonymGraphFilter(tokenStream, synonyms, ignoreCase);
}