/**
 * Morphologically analyzes every token of a sentence.
 * Quote and hyphen characters are normalized before tokenization so that
 * typographic variants do not confuse the tokenizer.
 *
 * @param sentence raw input sentence.
 * @return one {@link WordAnalysis} per token, in token order.
 */
public List<WordAnalysis> analyzeSentence(String sentence) {
  String cleaned = TextUtil.normalizeQuotesHyphens(sentence);
  List<WordAnalysis> analyses = new ArrayList<>();
  // Analyze tokens one by one, preserving the tokenizer's order.
  for (Token t : tokenizer.tokenize(cleaned)) {
    analyses.add(analyze(t));
  }
  return analyses;
}
/**
 * Reads a UTF-8 text file line by line, normalizes quote and hyphen
 * characters, tokenizes each line and re-joins the tokens with single spaces.
 *
 * @param filename path of the file to read.
 * @return one processed (tokenized, space-joined) string per input line.
 * @throws IOException if the file cannot be read.
 */
public List<String> readAll(String filename) throws IOException {
  List<String> processed = new ArrayList<>();
  // NOTE(review): the underlying reader is not explicitly closed here —
  // confirm SimpleTextReader releases it once the iterator is exhausted.
  LineIterator lineIterator =
      SimpleTextReader.trimmingUTF8Reader(new File(filename)).getLineIterator();
  while (lineIterator.hasNext()) {
    String normalized = TextUtil.normalizeQuotesHyphens(lineIterator.next());
    processed.add(Joiner.on(" ").join(lexer.tokenizeToStrings(normalized)));
  }
  return processed;
}
// Normalize each line in place before further processing: apostrophes first,
// then quote/hyphen variants, then spaces and soft hyphens.
// NOTE(review): the loop body continues past this excerpt — its closing brace
// and the use of the normalized `line` are not visible here.
for (String line : lines) { line = TextUtil.normalizeApostrophes(line); line = TextUtil.normalizeQuotesHyphens(line); line = TextUtil.normalizeSpacesAndSoftHyphens(line);
// Normalization chain for a single string: unify quote/hyphen characters,
// collapse spaces and soft hyphens, then remove repeated symbols via the
// local helper. Fragment of an enclosing method not visible in this excerpt.
s = TextUtil.normalizeQuotesHyphens(s); s = TextUtil.normalizeSpacesAndSoftHyphens(s); s = removeMultipleSymbols(s);
/**
 * Runs named-entity recognition over the input file and writes the annotated
 * sentences to {@code <input name>.ne} under the output directory.
 *
 * <p>Pipeline: validate arguments, extract sentences from the input
 * paragraphs, load the morphology and the perceptron NER model, then for each
 * sentence apply the project's normalization chain, tokenize, tag entities
 * and write one training-format line per sentence. Throughput statistics are
 * logged at the end.
 *
 * @throws Exception if argument validation, model loading or file I/O fails.
 */
@Override
public void run() throws Exception {
  initializeOutputDir();
  IOUtil.checkDirectoryArgument(modelRoot, "Model Root");
  IOUtil.checkFileArgument(inputPath, "Input File");
  Path outputPath = outDir.resolve(inputPath.toFile().getName() + ".ne");

  List<String> inputLines = Files.readAllLines(inputPath, StandardCharsets.UTF_8);
  List<String> sentenceList = TurkishSentenceExtractor.DEFAULT.fromParagraphs(inputLines);
  Log.info("There are %d lines and about %d sentences", inputLines.size(), sentenceList.size());

  TurkishMorphology morphology = TurkishMorphology.createWithDefaults();
  PerceptronNer ner = PerceptronNer.loadModel(modelRoot, morphology);

  Stopwatch stopwatch = Stopwatch.createStarted();
  int totalTokens = 0;
  try (PrintWriter writer = new PrintWriter(outputPath.toFile(), "UTF-8")) {
    for (String s : sentenceList) {
      // Same normalization chain used elsewhere in the project:
      // apostrophes, quote/hyphen variants, spaces and soft hyphens.
      s = TextUtil.normalizeApostrophes(s);
      s = TextUtil.normalizeQuotesHyphens(s);
      s = TextUtil.normalizeSpacesAndSoftHyphens(s);
      List<String> tokens = TurkishTokenizer.DEFAULT.tokenizeToStrings(s);
      totalTokens += tokens.size();
      NerSentence tagged = ner.findNamedEntities(s, tokens);
      writer.println(tagged.getAsTrainingSentence(annotationStyle));
    }
  }

  double elapsedSeconds = stopwatch.elapsed(TimeUnit.MILLISECONDS) / 1000d;
  Log.info("Token count = %s", totalTokens);
  Log.info("File processed in %.4f seconds.", elapsedSeconds);
  Log.info("Speed = %.2f tokens/sec", totalTokens / elapsedSeconds);
  Log.info("Result is written in %s", outputPath);
}