/**
 * Normalizes {@code input} for matching: lower-cases with the Turkish locale {@code TR},
 * unifies apostrophe variants, then keeps only characters present in {@code letterMap}
 * plus '.' and '-'; every other character is replaced with '?'.
 *
 * @param input raw text to normalize; must not be null.
 * @return normalized string of the same length as the normalized input.
 */
public String normalize(String input) {
  // Normalize first, then size the builder from the string actually iterated.
  String normalized = TextUtil.normalizeApostrophes(input.toLowerCase(TR));
  StringBuilder sb = new StringBuilder(normalized.length());
  for (char c : normalized.toCharArray()) {
    if (letterMap.containsKey(c) || c == '.' || c == '-') {
      sb.append(c);
    } else {
      // char append avoids allocating a String per unknown character.
      sb.append('?');
    }
  }
  return sb.toString();
}
/**
 * Normalizes a word for morphological analysis: Turkish-locale lower-casing,
 * circumflex normalization, dot removal, then apostrophe normalization.
 *
 * @param word surface form to normalize; must not be null.
 * @return normalized word; dots are kept if removing them would leave an empty string
 *     (e.g. the word consisted only of dots).
 */
public static String normalizeForAnalysis(String word) {
  String s = word.toLowerCase(Turkish.LOCALE);
  s = TurkishAlphabet.INSTANCE.normalizeCircumflex(s);
  String noDot = s.replace(".", "");
  if (noDot.isEmpty()) {
    // Removing dots erased everything; fall back to the dotted form.
    noDot = s;
  }
  return TextUtil.normalizeApostrophes(noDot);
}
/**
 * Extracts surface-form features for one token and appends them to {@code features},
 * each prefixed with {@code featurePrefix}. Emits capitalization, length-1,
 * all-caps, and apostrophe-split (stem/ending) features. No-op when word is null.
 */
void wordFeatures(String word, String featurePrefix, List<String> features) {
  if (word == null) {
    return;
  }
  // First character capitalization.
  features.add(featurePrefix + "Upper:" + Character.isUpperCase(word.charAt(0)));
  // NOTE(review): "Punct" is true for ANY length-1 token, including single letters
  // and digits — confirm this is the intended punctuation proxy.
  features.add(featurePrefix + "Punct:" + (word.length() == 1));
  // True only when every character is upper case.
  boolean allCap = true;
  for (char c : word.toCharArray()) {
    if (!Character.isUpperCase(c)) {
      allCap = false;
      break;
    }
  }
  features.add(featurePrefix + "AllCap:" + allCap);
  // Apostrophe is searched in the normalized copy, but stem/ending substrings are
  // taken from the ORIGINAL word at the same index. This assumes
  // normalizeApostrophes is length-preserving — TODO confirm.
  String s = TextUtil.normalizeApostrophes(word);
  int apostropheIndex = s.indexOf('\'');
  features.add(featurePrefix + "Apost:" + (apostropheIndex >= 0));
  if (apostropheIndex >= 0) {
    String stem = word.substring(0, apostropheIndex);
    String ending = word.substring(apostropheIndex + 1);
    features.add(featurePrefix + "Stem:" + stem);
    features.add(featurePrefix + "Ending:" + ending);
  }
}
// Closing brace of the enclosing class (class header is outside this view).
}
// Standard text cleanup chain: unify apostrophe variants, unify quote/hyphen
// variants, then normalize spaces and strip soft hyphens.
line = TextUtil.normalizeApostrophes(line); line = TextUtil.normalizeQuotesHyphens(line); line = TextUtil.normalizeSpacesAndSoftHyphens(line);
// Same cleanup chain as elsewhere in this file: apostrophes, quotes/hyphens,
// then spaces and soft hyphens.
s = TextUtil.normalizeApostrophes(s); s = TextUtil.normalizeQuotesHyphens(s); s = TextUtil.normalizeSpacesAndSoftHyphens(s);
/**
 * Reads the input file, splits it into sentences, runs the perceptron NER model on
 * each normalized sentence, and writes the annotated sentences to
 * {@code <input name>.ne} in the output directory. Logs counts and throughput.
 *
 * @throws Exception if argument checks fail or any I/O / model loading step fails.
 */
@Override
public void run() throws Exception {
  initializeOutputDir();
  IOUtil.checkDirectoryArgument(modelRoot, "Model Root");
  IOUtil.checkFileArgument(inputPath, "Input File");

  Path outputPath = outDir.resolve(inputPath.toFile().getName() + ".ne");

  List<String> inputLines = Files.readAllLines(inputPath, StandardCharsets.UTF_8);
  List<String> sentenceList = TurkishSentenceExtractor.DEFAULT.fromParagraphs(inputLines);
  Log.info("There are %d lines and about %d sentences", inputLines.size(), sentenceList.size());

  TurkishMorphology morphology = TurkishMorphology.createWithDefaults();
  PerceptronNer nerModel = PerceptronNer.loadModel(modelRoot, morphology);

  Stopwatch timer = Stopwatch.createStarted();
  int totalTokens = 0;
  // try-with-resources guarantees the writer is closed even on failure.
  try (PrintWriter writer = new PrintWriter(outputPath.toFile(), "UTF-8")) {
    for (String raw : sentenceList) {
      // Normalize before tokenization so tokens match training-time preprocessing.
      String sentence = TextUtil.normalizeApostrophes(raw);
      sentence = TextUtil.normalizeQuotesHyphens(sentence);
      sentence = TextUtil.normalizeSpacesAndSoftHyphens(sentence);
      List<String> tokens = TurkishTokenizer.DEFAULT.tokenizeToStrings(sentence);
      totalTokens += tokens.size();
      NerSentence nerResult = nerModel.findNamedEntities(sentence, tokens);
      writer.println(nerResult.getAsTrainingSentence(annotationStyle));
    }
  }
  double elapsedSeconds = timer.elapsed(TimeUnit.MILLISECONDS) / 1000d;
  Log.info("Token count = %s", totalTokens);
  Log.info("File processed in %.4f seconds.", elapsedSeconds);
  Log.info("Speed = %.2f tokens/sec", totalTokens / elapsedSeconds);
  Log.info("Result is written in %s", outputPath);
}