@Override
public void add(CharsRef input, CharsRef output, boolean includeOrig) {
  // This condition follows up on the overridden analyze method. If lenient was set to true and
  // super.analyze threw an exception, a zero-length CharsRef was returned for the offending word.
  // When the synonym mappings are added here, we skip any entry that analyze left empty; i.e.,
  // with lenient set we only add combinations where both sides are non-zero-length. The else
  // branch is reached only when the input or output is empty and lenient is set, in which case
  // the mapping is quietly ignored. For more details on the control flow see
  // SolrSynonymParser::addInternal.
  if (lenient == false || (input.length > 0 && output.length > 0)) {
    super.add(input, output, includeOrig);
  }
}
@Override
public TokenStream create(TokenStream input) {
  // If the fst is null there are actually no synonyms, so just return the
  // original stream: there is nothing to do here.
  return map.fst == null ? input : new SynonymFilter(input, map, ignoreCase);
}
@Override
public TokenStream create(TokenStream input) {
  // If the fst is null there are actually no synonyms, so just return the
  // original stream: there is nothing to do here.
  return map.fst == null ? input : new SynonymGraphFilter(input, map, ignoreCase);
}
/**
 * Add a phrase->phrase synonym mapping.
 * Phrases are character sequences where words are
 * separated with character zero (U+0000). Empty words
 * (two U+0000s in a row) are not allowed in the input nor
 * the output!
 *
 * @param input input phrase
 * @param output output phrase
 * @param includeOrig true if the original should be included
 */
public void add(CharsRef input, CharsRef output, boolean includeOrig) {
  add(input, countWords(input), output, countWords(output), includeOrig);
}
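// For context: callers normally do not write the U+0000 separators by hand; the
// static helper SynonymMap.Builder.join encodes a word array into that form. Below
// is a minimal sketch of building a map with one phrase rule. The "wi fi" -> "wifi"
// rule and the class name PhraseSynonymExample are invented for illustration;
// join/add/build are the Lucene Builder API.
import java.io.IOException;

import org.apache.lucene.analysis.synonym.SynonymMap;
import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.CharsRefBuilder;

public class PhraseSynonymExample {
  public static SynonymMap build() throws IOException {
    SynonymMap.Builder builder = new SynonymMap.Builder(true); // dedup duplicate rules
    // join(...) inserts the U+0000 word separator between "wi" and "fi":
    CharsRef input = SynonymMap.Builder.join(new String[] {"wi", "fi"}, new CharsRefBuilder());
    CharsRef output = SynonymMap.Builder.join(new String[] {"wifi"}, new CharsRefBuilder());
    builder.add(input, output, true); // true: keep the original tokens as well
    return builder.build();
  }
}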
protected SynonymMap buildSynonyms(Analyzer analyzer, Reader rules) {
  try {
    SynonymMap.Parser parser;
    if ("wordnet".equalsIgnoreCase(format)) {
      parser = new ESWordnetSynonymParser(true, expand, lenient, analyzer);
    } else {
      parser = new ESSolrSynonymParser(true, expand, lenient, analyzer);
    }
    parser.parse(rules);
    return parser.build();
  } catch (Exception e) {
    throw new IllegalArgumentException("failed to build synonyms", e);
  }
}
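// As a usage illustration, the Solr-format branch above amounts to the following
// sketch. The rule text and the WhitespaceAnalyzer choice are invented for
// illustration; only the constructor and parse/build calls visible above are assumed.
import java.io.Reader;
import java.io.StringReader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import org.apache.lucene.analysis.synonym.SynonymMap;

public class SolrRulesExample {
  public static SynonymMap parseRules() throws Exception {
    Analyzer analyzer = new WhitespaceAnalyzer();
    // Two Solr-format rules: an equivalence set and an explicit mapping.
    Reader rules = new StringReader("i-pod, ipod\nsea biscuit => seabiscuit");
    // dedup = true, expand = true, lenient = false, as in the factory above
    ESSolrSynonymParser parser = new ESSolrSynonymParser(true, true, false, analyzer);
    parser.parse(rules);
    return parser.build();
  }
}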
@Override
public void reset() throws IOException {
  super.reset();
  captureCount = 0;
  finished = false;
  inputSkipCount = 0;
  nextRead = nextWrite = 0;

  // In normal usage these resets would not be needed,
  // since they reset-as-they-are-consumed, but the app
  // may not consume all input tokens (or we might hit an
  // exception), in which case we have leftover state
  // here:
  for (PendingInput input : futureInputs) {
    input.reset();
  }
  for (PendingOutputs output : futureOutputs) {
    output.reset();
  }
}
private CharsRef parseSynonym(String line, CharsRefBuilder reuse) throws IOException {
  if (reuse == null) {
    reuse = new CharsRefBuilder();
  }

  // The synonym text sits between single quotes in the WordNet prolog line;
  // a doubled quote ('') is the escape for a literal quote.
  int start = line.indexOf('\'') + 1;
  int end = line.lastIndexOf('\'');

  String text = line.substring(start, end).replace("''", "'");
  return analyze(text, reuse);
}
/**
 * @param input input tokenstream
 * @param synonyms synonym map
 * @param ignoreCase case-folds input for matching with {@link Character#toLowerCase(int)}.
 *                   Note, if you set this to true, it's your responsibility to lowercase
 *                   the input entries when you create the {@link SynonymMap}
 */
public SynonymFilter(TokenStream input, SynonymMap synonyms, boolean ignoreCase) {
  super(input);
  this.synonyms = synonyms;
  this.ignoreCase = ignoreCase;
  this.fst = synonyms.fst;
  if (fst == null) {
    throw new IllegalArgumentException("fst must be non-null");
  }
  this.fstReader = fst.getBytesReader();

  // Must be 1+ so that when roll buffer is at full
  // lookahead we can distinguish this full buffer from
  // the empty buffer:
  rollBufferSize = 1 + synonyms.maxHorizontalContext;

  futureInputs = new PendingInput[rollBufferSize];
  futureOutputs = new PendingOutputs[rollBufferSize];
  for (int pos = 0; pos < rollBufferSize; pos++) {
    futureInputs[pos] = new PendingInput();
    futureOutputs[pos] = new PendingOutputs();
  }

  //System.out.println("FSTFilt maxH=" + synonyms.maxHorizontalContext);

  scratchArc = new FST.Arc<>();
}
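// As a usage note, the ignoreCase contract in the javadoc above matters in
// practice: with ignoreCase = true, the SynonymMap entries must already be
// lowercased. Below is a minimal sketch wiring the filter into an analysis
// chain; the class name SynonymAnalyzer and the whitespace tokenizer choice
// are assumptions for illustration, and it assumes a Lucene line where
// createComponents takes only the field name.
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.synonym.SynonymFilter;
import org.apache.lucene.analysis.synonym.SynonymMap;

public class SynonymAnalyzer extends Analyzer {
  private final SynonymMap map;

  public SynonymAnalyzer(SynonymMap map) {
    this.map = map;
  }

  @Override
  protected TokenStreamComponents createComponents(String fieldName) {
    Tokenizer source = new WhitespaceTokenizer();
    // ignoreCase = true, so the map entries must have been lowercased:
    TokenStream result = new SynonymFilter(source, map, true);
    return new TokenStreamComponents(source, result);
  }
}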
private void capture() {
  captureCount++;
  //System.out.println("  capture slot=" + nextWrite);
  final PendingInput input = futureInputs[nextWrite];

  input.state = captureState();
  input.consumed = false;
  input.term.copyChars(termAtt.buffer(), 0, termAtt.length());

  nextWrite = rollIncr(nextWrite);

  // Buffer head should never catch up to tail:
  assert nextWrite != nextRead;
}
@Override
public CharsRef analyze(String text, CharsRefBuilder reuse) throws IOException {
  try {
    return super.analyze(text, reuse);
  } catch (IllegalArgumentException ex) {
    if (lenient) {
      logger.info("Synonym rule for [" + text + "] was ignored");
      return new CharsRef("");
    } else {
      throw ex;
    }
  }
}
/**
 * Load synonyms with the given {@link SynonymMap.Parser} class.
 */
protected SynonymMap loadSynonyms(ResourceLoader loader, String cname, boolean dedup, Analyzer analyzer)
    throws IOException, ParseException {
  CharsetDecoder decoder = StandardCharsets.UTF_8.newDecoder()
      .onMalformedInput(CodingErrorAction.REPORT)
      .onUnmappableCharacter(CodingErrorAction.REPORT);

  SynonymMap.Parser parser;
  Class<? extends SynonymMap.Parser> clazz = loader.findClass(cname, SynonymMap.Parser.class);
  try {
    parser = clazz.getConstructor(boolean.class, boolean.class, Analyzer.class)
        .newInstance(dedup, expand, analyzer);
  } catch (Exception e) {
    throw new RuntimeException(e);
  }

  List<String> files = splitFileNames(synonyms);
  for (String file : files) {
    decoder.reset();
    parser.parse(new InputStreamReader(loader.openResource(file), decoder));
  }
  return parser.build();
}
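// For illustration, any parser class passed as cname must expose the
// (boolean, boolean, Analyzer) constructor that the reflective lookup above
// expects. A hypothetical skeleton follows: CsvSynonymParser and its one-rule-
// per-line "input,output" format are invented; only the SynonymMap.Parser base
// constructor and the analyze/add contract are assumed.
import java.io.BufferedReader;
import java.io.IOException;
import java.io.Reader;
import java.text.ParseException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.synonym.SynonymMap;
import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.CharsRefBuilder;

public class CsvSynonymParser extends SynonymMap.Parser {

  public CsvSynonymParser(boolean dedup, boolean expand, Analyzer analyzer) {
    super(dedup, analyzer); // expand is unused in this toy format
  }

  @Override
  public void parse(Reader in) throws IOException, ParseException {
    BufferedReader br = new BufferedReader(in);
    String line;
    while ((line = br.readLine()) != null) {
      String[] sides = line.split(",", 2);
      if (sides.length != 2) {
        throw new ParseException("expected 'input,output': " + line, 0);
      }
      // analyze(...) runs each side through the analyzer and U+0000-joins the tokens:
      CharsRef input = analyze(sides[0].trim(), new CharsRefBuilder());
      CharsRef output = analyze(sides[1].trim(), new CharsRefBuilder());
      add(input, output, true);
    }
  }
}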
public CharsRef pullNext() {
  assert upto < count;
  lastEndOffset = endOffsets[upto];
  lastPosLength = posLengths[upto];
  final CharsRefBuilder result = outputs[upto++];
  // After the first output is pulled, the remaining ones stack onto the same position:
  posIncr = 0;
  if (upto == count) {
    reset();
  }
  return result.get();
}
@Override
public void parse(Reader in) throws IOException, ParseException {
  LineNumberReader br = new LineNumberReader(in);
  try {
    addInternal(br);
  } catch (IllegalArgumentException e) {
    ParseException ex = new ParseException("Invalid synonym rule at line " + br.getLineNumber(), 0);
    ex.initCause(e);
    throw ex;
  } finally {
    br.close();
  }
}
private final RollingBuffer<BufferedInputToken> lookahead =
    new RollingBuffer<BufferedInputToken>() {
      @Override
      protected BufferedInputToken newInstance() {
        return new BufferedInputToken();
      }
    };
/** Buffers the current input token into the lookahead buffer. */
private void capture() {
  assert liveToken;
  liveToken = false;
  BufferedInputToken token = lookahead.get(lookaheadNextWrite);
  lookaheadNextWrite++;

  token.state = captureState();
  token.startOffset = offsetAtt.startOffset();
  token.endOffset = offsetAtt.endOffset();
  assert token.term.length() == 0;
  token.term.append(termAtt);

  captureCount++;
  maxLookaheadUsed = Math.max(maxLookaheadUsed, lookahead.getBufferSize());
  //System.out.println("  maxLookaheadUsed=" + maxLookaheadUsed);
}
@Override
public TokenStream create(TokenStream tokenStream) {
  // A null fst means no synonym rules were loaded; pass the stream through unchanged.
  return synonyms.fst == null ? tokenStream : new SynonymFilter(tokenStream, synonyms, false);
}
private void addInternal(CharsRef[] synset, int size) {
  if (size <= 1) {
    return; // nothing to do
  }

  if (expand) {
    // Expand: map every member of the synset to every member (including itself).
    for (int i = 0; i < size; i++) {
      for (int j = 0; j < size; j++) {
        add(synset[i], synset[j], false);
      }
    }
  } else {
    // Contract: map every member of the synset to the first entry only.
    for (int i = 0; i < size; i++) {
      add(synset[i], synset[0], false);
    }
  }
}
@Override
public TokenStream create(TokenStream tokenStream) {
  // A null fst means there are no synonyms; pass the stream through unchanged.
  return synonyms.fst == null ? tokenStream : new SynonymGraphFilter(tokenStream, synonyms, ignoreCase);
}