// Canonicalize the raw text line before tokenization: unify apostrophe
// variants, then quote/hyphen variants, then whitespace and soft hyphens.
// NOTE(review): this is a fragment of an enclosing method not visible in
// this chunk; `line` is declared elsewhere.
line = TextUtil.normalizeApostrophes(line);
line = TextUtil.normalizeQuotesHyphens(line);
line = TextUtil.normalizeSpacesAndSoftHyphens(line);
// Same normalization chain as elsewhere in this file (apostrophes, quotes
// and hyphens, spaces and soft hyphens), followed by collapsing repeated
// symbols via the local helper removeMultipleSymbols.
// NOTE(review): fragment of an enclosing method not visible in this chunk;
// `s` is declared elsewhere.
s = TextUtil.normalizeApostrophes(s);
s = TextUtil.normalizeQuotesHyphens(s);
s = TextUtil.normalizeSpacesAndSoftHyphens(s);
s = removeMultipleSymbols(s);
/**
 * Runs named-entity recognition over every sentence of the input file and
 * writes the annotated sentences to {@code <inputFileName>.ne} in the
 * output directory.
 *
 * <p>Steps: validate CLI arguments, extract sentences from the input
 * paragraphs, load the morphology and perceptron NER model, then tag each
 * normalized sentence and emit it in the configured annotation style.
 * Timing and throughput statistics are logged at the end.
 *
 * @throws Exception if argument validation, model loading, or file I/O fails
 */
@Override
public void run() throws Exception {
  initializeOutputDir();
  IOUtil.checkDirectoryArgument(modelRoot, "Model Root");
  IOUtil.checkFileArgument(inputPath, "Input File");

  // Output file name mirrors the input name with a ".ne" suffix.
  Path outputFile = outDir.resolve(inputPath.toFile().getName() + ".ne");

  List<String> paragraphs = Files.readAllLines(inputPath, StandardCharsets.UTF_8);
  List<String> sentenceList = TurkishSentenceExtractor.DEFAULT.fromParagraphs(paragraphs);
  Log.info("There are %d lines and about %d sentences", paragraphs.size(), sentenceList.size());

  TurkishMorphology morph = TurkishMorphology.createWithDefaults();
  PerceptronNer nerModel = PerceptronNer.loadModel(modelRoot, morph);

  Stopwatch timer = Stopwatch.createStarted();
  int totalTokens = 0;
  try (PrintWriter writer = new PrintWriter(outputFile.toFile(), "UTF-8")) {
    for (String raw : sentenceList) {
      // Normalize punctuation variants and whitespace before tokenizing.
      String sentence = TextUtil.normalizeApostrophes(raw);
      sentence = TextUtil.normalizeQuotesHyphens(sentence);
      sentence = TextUtil.normalizeSpacesAndSoftHyphens(sentence);

      List<String> tokens = TurkishTokenizer.DEFAULT.tokenizeToStrings(sentence);
      totalTokens += tokens.size();

      NerSentence tagged = nerModel.findNamedEntities(sentence, tokens);
      writer.println(tagged.getAsTrainingSentence(annotationStyle));
    }
  }

  double elapsedSeconds = timer.elapsed(TimeUnit.MILLISECONDS) / 1000d;
  Log.info("Token count = %s", totalTokens);
  Log.info("File processed in %.4f seconds.", elapsedSeconds);
  Log.info("Speed = %.2f tokens/sec", totalTokens / elapsedSeconds);
  Log.info("Result is written in %s", outputFile);
}