/**
 * Normalizes free text: lower-cases it, converts it to canonical Unicode,
 * re-tokenizes and re-joins it, strips punctuation, and collapses newlines
 * into spaces.
 *
 * @param text raw input text (must be non-null)
 * @return the normalized text
 */
public String normalize(String text) {
    // Locale.ROOT gives locale-independent case mapping (avoids e.g. the
    // Turkish dotless-i problem with the default-locale toLowerCase()).
    text = text.toLowerCase(java.util.Locale.ROOT);
    text = TextNormalizer.convertToUnicode(text);
    // Tokenize and re-join so token boundaries are normalized to single spaces.
    text = TextNormalizer.joinStrings(tokenizer.tokenize(text));
    text = TextNormalizer.removePunctuations(text);
    // Literal newline replacement — replace(char, char) avoids compiling a regex.
    text = text.replace('\n', ' ').trim();
    return text;
}
/**
 * Normalizes text for the given language: lower-cases it, converts it to
 * canonical Unicode, tokenizes it with a per-language tokenizer (created
 * lazily and cached in {@code langTokenizerMap}), strips punctuation,
 * collapses newlines, and finally removes diacritics.
 *
 * @param text     raw input text (must be non-null)
 * @param language language used to select (and cache) the tokenizer
 * @return the normalized, de-accented text
 */
private String normalize(String text, Language language) {
    // Locale.ROOT gives locale-independent case mapping (avoids e.g. the
    // Turkish dotless-i problem with the default-locale toLowerCase()).
    text = text.toLowerCase(java.util.Locale.ROOT);
    text = TextNormalizer.convertToUnicode(text);
    // computeIfAbsent replaces the containsKey/put check-then-act pair:
    // the tokenizer for each language is created once and reused.
    List<String> tokens = langTokenizerMap
            .computeIfAbsent(language, TextNormalizer::getTokenizer)
            .tokenize(text);
    text = TextNormalizer.joinStrings(tokens);
    text = TextNormalizer.removePunctuations(text);
    // Literal newline replacement — replace(char, char) avoids compiling a regex.
    text = text.replace('\n', ' ').trim();
    text = TextNormalizer.deAccent(text);
    return text;
}
/**
 * Normalizes text for the given language: lower-cases it, converts it to
 * canonical Unicode, tokenizes it with a per-language tokenizer (created
 * lazily and cached in {@code langTokenizerMap}), strips punctuation,
 * collapses newlines, and finally removes diacritics.
 *
 * @param text     raw input text (must be non-null)
 * @param language language used to select (and cache) the tokenizer
 * @return the normalized, de-accented text
 */
private String normalize(String text, Language language) {
    // Locale.ROOT gives locale-independent case mapping (avoids e.g. the
    // Turkish dotless-i problem with the default-locale toLowerCase()).
    text = text.toLowerCase(java.util.Locale.ROOT);
    text = TextNormalizer.convertToUnicode(text);
    // computeIfAbsent replaces the containsKey/put check-then-act pair:
    // the tokenizer for each language is created once and reused.
    List<String> tokens = langTokenizerMap
            .computeIfAbsent(language, TextNormalizer::getTokenizer)
            .tokenize(text);
    text = TextNormalizer.joinStrings(tokens);
    text = TextNormalizer.removePunctuations(text);
    // Literal newline replacement — replace(char, char) avoids compiling a regex.
    text = text.replace('\n', ' ').trim();
    text = TextNormalizer.deAccent(text);
    return text;
}
// Fragment of a larger method (start/end not visible in this chunk):
// for every label of the source language, lower-case it, tokenize it, and
// accumulate the tokens into ontoDocTokens.
// NOTE(review): toLowerCase() uses the default locale here, unlike nothing
// visible that pins a locale — presumably intentional, but confirm against
// the rest of the file (the Turkish dotless-i issue applies).
if (labels.containsKey(srcLang)) { for (String label : labels.get(srcLang)) { for (String token : tokenizer.tokenize(label.toLowerCase())) { ontoDocTokens.add(token);
/**
 * Decodes candidate translations for a single entity label.
 *
 * <p>The source label is chunked, candidate translations for every chunk are
 * collected from all configured sources into a phrase table, the table is
 * re-ranked by each featurizer in turn (best-effort), and the decoder produces
 * the n-best translations — via the fast path when the
 * {@code OntologyTranslator.DECODE_FAST} option bit is set.
 *
 * @param setup   translator configuration (languages, chunker, sources, featurizers)
 * @param decoder decoder used to produce translations
 * @param el      the entity label to translate
 * @param options bit flags; {@code DECODE_FAST} selects {@code decodeFast}
 * @return the decoded n-best translations
 */
private List<Translation> doDecoding(TranslatorSetup setup, Decoder decoder, EntityLabel el, int options) {
    final ChunkList chunkList = setup.chunker(el.entity).chunk(tokenizer.tokenize(el.srcLabel));
    final PhraseTableImpl pt =
            new PhraseTableImpl(setup.sourceLanguage(), setup.targetLanguage(), "mert_table");
    // Gather candidate translations for every chunk from every configured source.
    for (Chunk chunk : chunkList) {
        for (TranslationSource source : setup.sources()) {
            pt.addAll(source.candidates(chunk));
        }
    }
    // Apply each featurizer in turn. A failing featurizer is skipped so that
    // decoding stays best-effort, but the failure is no longer silently
    // swallowed — it is reported for diagnosis.
    PhraseTable rerankedTable = pt;
    for (TranslationFeaturizer featurizer : setup.featurizers(el.entity)) {
        try {
            rerankedTable = featurizer.featurize(rerankedTable, el.entity);
        } catch (Exception x) {
            // NOTE(review): was an empty catch; kept non-fatal by design but
            // surfaced. Replace with the project's logger if one is available.
            System.err.println("Featurizer failed for label '" + el.srcLabel + "': " + x);
        }
    }
    // Hoist the whitespace split so it is computed once for either decode path.
    final List<String> srcTokens = Arrays.asList(el.srcLabel.split("\\s+"));
    return (options & OntologyTranslator.DECODE_FAST) == 0
            ? decoder.decode(srcTokens, rerankedTable, setup.featureNames(), nBest)
            : decoder.decodeFast(srcTokens, rerankedTable, setup.featureNames(), nBest);
}