/**
 * Creates an {@link IkAnalyzerProvider} whose trailing boolean constructor
 * argument is {@code true}.
 *
 * <p>NOTE(review): judging by the method name the flag selects the "smart"
 * mode; confirm against the {@code IkAnalyzerProvider} constructor.
 *
 * @param indexSettings index-level settings, forwarded to the provider
 * @param env           node environment, forwarded to the provider
 * @param name          analyzer name, forwarded to the provider
 * @param settings      analyzer settings, forwarded to the provider
 * @return a newly constructed provider
 */
public static IkAnalyzerProvider getIkSmartAnalyzerProvider(IndexSettings indexSettings,
                                                            Environment env,
                                                            String name,
                                                            Settings settings) {
    final boolean smartFlag = true;
    IkAnalyzerProvider provider = new IkAnalyzerProvider(indexSettings, env, name, settings, smartFlag);
    return provider;
}
/**
 * Creates an {@link IkTokenizerFactory} and returns the result of calling
 * {@code setSmart(false)} on it.
 *
 * @param indexSettings index-level settings, forwarded to the factory
 * @param env           node environment, forwarded to the factory
 * @param name          tokenizer name, forwarded to the factory
 * @param settings      tokenizer settings, forwarded to the factory
 * @return whatever {@code setSmart(false)} returns for the new factory
 */
public static IkTokenizerFactory getIkTokenizerFactory(IndexSettings indexSettings,
                                                       Environment env,
                                                       String name,
                                                       Settings settings) {
    IkTokenizerFactory factory = new IkTokenizerFactory(indexSettings, env, name, settings);
    return factory.setSmart(false);
}
/**
 * Returns the shared {@link PinyinAlphabetDict}, creating it on first use.
 *
 * <p>The instance is checked once without a lock and, if still unset, checked
 * again while holding the class monitor before being constructed.
 * NOTE(review): the declaration of {@code instance} is not visible here;
 * confirm it is {@code volatile} so the unlocked read is safe.
 *
 * @return the shared dictionary instance
 */
public static PinyinAlphabetDict getInstance() {
    // Fast path: already created, no locking needed.
    if (instance != null) {
        return instance;
    }
    synchronized (PinyinAlphabetDict.class) {
        // Re-check under the lock; another thread may have created it meanwhile.
        if (instance == null) {
            instance = new PinyinAlphabetDict();
        }
    }
    return instance;
}
/**
 * Flushes the buffered non-Chinese text into candidate terms and clears the buffer.
 *
 * <p>Nothing is emitted unless {@code config.keepNoneChinese} is set. When
 * {@code config.noneChinesePinyinTokenize} is also set, the buffer is split with
 * {@link PinyinAlphabetTokenizer#walk(String)} and every piece becomes its own
 * candidate at a new position; otherwise the whole buffer is emitted as a single
 * candidate, provided one of the listed keep-flags allows it.
 *
 * @param buff         buffer holding the pending text; always emptied before returning
 * @param buffSize     number of buffered characters, used to derive start offsets
 * @param buffPosition not read by this method
 * @return always {@code 0}, the buffer size after clearing
 */
private int parseBuff(StringBuilder buff, int buffSize, int buffPosition) {
    if (config.keepNoneChinese) {
        if (config.noneChinesePinyinTokenize) {
            int cursor = lastOffset - buffSize + 1;
            for (String piece : PinyinAlphabetTokenizer.walk(buff.toString())) {
                // With fixed offsets every piece spans exactly one offset unit.
                int stop = config.fixedPinyinOffset ? cursor + 1 : cursor + piece.length();
                addCandidate(new TermItem(piece, cursor, stop, ++position));
                cursor = stop;
            }
        } else if (config.keepFirstLetter
                || config.keepSeparateFirstLetter
                || config.keepFullPinyin
                || !config.keepNoneChineseInJoinedFullPinyin) {
            addCandidate(new TermItem(buff.toString(), lastOffset - buffSize, lastOffset, ++position));
        }
    }

    buff.setLength(0);
    return 0;
}
/**
 * Turns the pending non-Chinese buffer into candidate terms, then resets the buffer.
 *
 * <p>Emission only happens when {@code config.keepNoneChinese} is on. With
 * {@code config.noneChinesePinyinTokenize} the text is segmented through
 * {@link PinyinAlphabetTokenizer#walk(String)} and each segment is added as a
 * candidate with an incremented position. Without it, the whole buffer is added
 * as one candidate if any of the checked keep-flags permits.
 *
 * @param buff         pending text; its length is set to zero on exit
 * @param buffSize     count of buffered characters, used for offset arithmetic
 * @param buffPosition unused in this method
 * @return {@code 0} in every case
 */
private int parseBuff(StringBuilder buff, int buffSize, int buffPosition) {
    if (config.keepNoneChinese) {
        if (config.noneChinesePinyinTokenize) {
            List<String> segments = PinyinAlphabetTokenizer.walk(buff.toString());
            int begin = lastOffset - buffSize + 1;
            for (String segment : segments) {
                int finish;
                if (config.fixedPinyinOffset) {
                    finish = begin + 1;
                } else {
                    finish = begin + segment.length();
                }
                addCandidate(new TermItem(segment, begin, finish, ++position));
                // The next segment starts where this one ended.
                begin = finish;
            }
        } else if (config.keepFirstLetter || config.keepSeparateFirstLetter
                || config.keepFullPinyin || !config.keepNoneChineseInJoinedFullPinyin) {
            int begin = lastOffset - buffSize;
            addCandidate(new TermItem(buff.toString(), begin, lastOffset, ++position));
        }
    }

    buff.setLength(0);
    return 0;
}
private static List<String> segPinyinStr(String content) { String pinyinStr = content; pinyinStr = pinyinStr.toLowerCase(); // 按非letter切分 List<String> pinyinStrList = splitByNoletter(pinyinStr); List<String> pinyinList = new ArrayList<>(); for (String pinyinText : pinyinStrList) { if (pinyinText.length() == 1) { pinyinList.add(pinyinText); } else { List<String> forward = positiveMaxMatch(pinyinText, PINYIN_MAX_LENGTH); if (forward.size() == 1) { // 前向只切出1个的话,没有必要再做逆向分词 pinyinList.addAll(forward); } else { // 分别正向、逆向最大匹配,选出最短的作为最优结果 List<String> backward = reverseMaxMatch(pinyinText, PINYIN_MAX_LENGTH); if (forward.size() <= backward.size()) { pinyinList.addAll(forward); } else { pinyinList.addAll(backward); } } } } return pinyinList; }
/**
 * Advances this stream to its next token.
 *
 * <p>While {@code done} is false, {@code readTerm()} is tried first and a
 * {@code true} result ends the call. Once {@code done} is true, per-token state
 * is reset and the wrapped {@code input} is advanced; if the input is exhausted
 * the method returns {@code false}. Otherwise {@code readTerm()} is invoked
 * (its result is ignored) and {@code true} is returned.
 *
 * <p>NOTE(review): {@code readTerm()} presumably flips {@code done} when the
 * current input token has no further terms; confirm in its implementation.
 *
 * @return {@code false} only when the wrapped input has no more tokens
 * @throws IOException if the wrapped input fails
 */
@Override
public final boolean incrementToken() throws IOException {
    if (!done && readTerm()) {
        return true;
    }

    if (done) {
        resetVariable();
        boolean upstreamHasToken = input.incrementToken();
        if (!upstreamHasToken) {
            return false;
        }
        done = false;
    }

    readTerm();
    return true;
}
for (int j = 0; j < sixStr.length(); j++) { String guess = sixStr.substring(0, sixStr.length() - j); if (PinyinAlphabetDict.getInstance().match(guess)) { pinyinList.add(guess); start += guess.length();
/**
 * Creates a new {@link PinyinTokenizer} using this factory's {@code config}.
 *
 * @return a fresh tokenizer instance
 */
@Override
public Tokenizer create() {
    PinyinTokenizer tokenizer = new PinyinTokenizer(config);
    return tokenizer;
}
} // closes the enclosing type, whose header is outside this excerpt
public PinyinTokenFilter(TokenStream in, PinyinConfig config) { super(in); this.config = config; //validate config if (!(config.keepFirstLetter || config.keepSeparateFirstLetter || config.keepFullPinyin || config.keepJoinedFullPinyin)) { throw new ConfigErrorException("pinyin config error, can't disable separate_first_letter, first_letter and full_pinyin at the same time."); } candidate = new ArrayList<>(); firstLetters = new StringBuilder(); termsFilter = new HashSet<>(); fullPinyinLetters = new StringBuilder(); }
/**
 * Wraps the given stream in a new {@link PinyinTokenFilter} built with this
 * factory's {@code config}.
 *
 * @param tokenStream stream to be filtered
 * @return the filtering stream
 */
@Override
public TokenStream create(TokenStream tokenStream) {
    PinyinTokenFilter filter = new PinyinTokenFilter(tokenStream, config);
    return filter;
}
} // closes the enclosing type, whose header is outside this excerpt
/**
 * Public entry point for segmentation; delegates directly to
 * {@code segPinyinStr(text)}.
 *
 * @param text text to segment
 * @return the list produced by {@code segPinyinStr}
 */
public static List<String> walk(String text) {
    List<String> segments = segPinyinStr(text);
    return segments;
}
/**
 * Resets this stream: delegates to {@code super.reset()}, sets {@code done}
 * to {@code true}, and clears per-token state via {@code resetVariable()}.
 *
 * @throws IOException if the superclass reset fails
 */
@Override
public void reset() throws IOException {
    super.reset();
    done = true;
    resetVariable();
}
/**
 * Builds the provider: a {@link PinyinConfig} is created from {@code settings}
 * and a {@link PinyinAnalyzer} is created from that config.
 *
 * @param indexSettings index-level settings, passed to the superclass
 * @param env           node environment; not read by this constructor
 * @param name          analyzer name, passed to the superclass
 * @param settings      analyzer settings, passed to the superclass and used to build the config
 */
@Inject
public PinyinAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) {
    super(indexSettings, name, settings);
    this.config = new PinyinConfig(settings);
    this.analyzer = new PinyinAnalyzer(this.config);
}
void setTerm(String term, int startOffset, int endOffset, int position) { if (config.lowercase) { term = term.toLowerCase(); } if (config.trimWhitespace) { term = term.trim(); } //ignore empty term if(term.length()==0){ return; } termAtt.setEmpty(); termAtt.append(term); if (startOffset < 0) { startOffset = 0; } if (endOffset < startOffset) { endOffset = startOffset + term.length(); } if(!config.ignorePinyinOffset){ offsetAtt.setOffset(correctOffset(startOffset), correctOffset(endOffset)); } int offset = position - lastIncrementPosition; if (offset < 0) { offset = 0; } positionAttr.setPositionIncrement(offset); lastIncrementPosition = position; }
/**
 * Creates an {@link IkTokenizerFactory} and returns the result of calling
 * {@code setSmart(true)} on it.
 *
 * @param indexSettings index-level settings, forwarded to the factory
 * @param env           node environment, forwarded to the factory
 * @param name          tokenizer name, forwarded to the factory
 * @param settings      tokenizer settings, forwarded to the factory
 * @return whatever {@code setSmart(true)} returns for the new factory
 */
public static IkTokenizerFactory getIkSmartTokenizerFactory(IndexSettings indexSettings,
                                                            Environment env,
                                                            String name,
                                                            Settings settings) {
    IkTokenizerFactory factory = new IkTokenizerFactory(indexSettings, env, name, settings);
    return factory.setSmart(true);
}
/**
 * Builds the analysis chain for a field: a single {@link PinyinTokenizer}
 * configured with this analyzer's {@code config}, with no additional filters.
 *
 * @param fieldName name of the field being analysed; not read by this method
 * @return components wrapping the new tokenizer
 */
@Override
protected TokenStreamComponents createComponents(String fieldName) {
    PinyinTokenizer tokenizer = new PinyinTokenizer(config);
    return new TokenStreamComponents(tokenizer);
}
/**
 * Creates an {@link IkAnalyzerProvider} whose trailing boolean constructor
 * argument is {@code false}.
 *
 * <p>NOTE(review): the flag presumably disables the "smart" mode; confirm
 * against the {@code IkAnalyzerProvider} constructor.
 *
 * @param indexSettings index-level settings, forwarded to the provider
 * @param env           node environment, forwarded to the provider
 * @param name          analyzer name, forwarded to the provider
 * @param settings      analyzer settings, forwarded to the provider
 * @return a newly constructed provider
 */
public static IkAnalyzerProvider getIkAnalyzerProvider(IndexSettings indexSettings,
                                                       Environment env,
                                                       String name,
                                                       Settings settings) {
    final boolean smartFlag = false;
    IkAnalyzerProvider provider = new IkAnalyzerProvider(indexSettings, env, name, settings, smartFlag);
    return provider;
}
public PinyinTokenizer(PinyinConfig config) { this(DEFAULT_BUFFER_SIZE); this.config = config; //validate config if (!(config.keepFirstLetter || config.keepSeparateFirstLetter || config.keepFullPinyin || config.keepJoinedFullPinyin)) { throw new ConfigErrorException("pinyin config error, can't disable separate_first_letter, first_letter and full_pinyin at the same time."); } candidate = new ArrayList<>(); termsFilter = new HashSet<>(); firstLetters = new StringBuilder(); fullPinyinLetters = new StringBuilder(); }
/**
 * Creates a {@link PinyinTokenizer} with a fixed, hard-coded configuration:
 * first-letter output on, full pinyin and original term off, non-Chinese text
 * dropped from the main output but kept in the first-letter form, with
 * lower-casing and whitespace trimming enabled.
 *
 * @return a tokenizer using that preset configuration
 */
@Override
public Tokenizer create() {
    PinyinConfig preset = new PinyinConfig();

    // Output forms.
    preset.keepFirstLetter = true;
    preset.keepFullPinyin = false;
    preset.keepOriginal = false;

    // Handling of non-Chinese text.
    preset.keepNoneChinese = false;
    preset.keepNoneChineseTogether = true;
    preset.keepNoneChineseInFirstLetter = true;
    preset.noneChinesePinyinTokenize = false;

    // Normalisation.
    preset.lowercase = true;
    preset.trimWhitespace = true;

    return new PinyinTokenizer(preset);
}
} // closes the enclosing type, whose header is outside this excerpt