/**
 * Copies the lines of {@code input} to {@code out}, leaving out every line that equals the
 * lemma of an item in the lexicon built from the bundled Turkish text dictionaries.
 *
 * <p>Input lines are de-duplicated and keep their first-seen order. The number of distinct
 * input lines and the number of lexicon matches are printed to standard output.
 *
 * @param input text file read as UTF-8, one candidate word per line
 * @param out file that receives the surviving lines, written as UTF-8
 * @throws IOException if an I/O error occurs, including a failure while writing {@code out}
 */
private static void removeZemberekDictionaryWordsFromList(Path input, Path out)
    throws IOException {
  LinkedHashSet<String> list = new LinkedHashSet<>(
      Files.readAllLines(input, StandardCharsets.UTF_8));
  System.out.println("Total amount of lines = " + list.size());

  TurkishMorphology morphology = TurkishMorphology.create(
      RootLexicon.builder().addTextDictionaryResources(
          "tr/master-dictionary.dict",
          "tr/non-tdk.dict",
          "tr/proper.dict",
          "tr/proper-from-corpus.dict",
          "tr/abbreviations.dict"
      ).build());

  // Kept as a list (not a set) so the reported count still includes one entry per matching
  // lexicon item, even when several items share the same lemma.
  List<String> toRemove = new ArrayList<>();
  for (DictionaryItem item : morphology.getLexicon()) {
    if (list.contains(item.lemma)) {
      toRemove.add(item.lemma);
    }
  }
  System.out.println("Total amount to remove = " + toRemove.size());

  // Remove by hash lookup, one lemma at a time. Set.removeAll(List) may instead scan the
  // list once per set element, which is quadratic when the list is the larger collection.
  for (String lemma : toRemove) {
    list.remove(lemma);
  }

  // Files.write reports write failures as IOException; PrintWriter would only set an
  // internal error flag and let the method return normally with a truncated file.
  Files.write(out, list, StandardCharsets.UTF_8);
}
/**
 * Reads the person-names dictionary, drops every entry whose parsed dictionary item is already
 * contained in the lexicon built from the other bundled dictionaries, and writes the remaining
 * entries, sorted with {@code Turkish.STRING_COMPARATOR_ASC}, to the "reduced" dictionary file.
 *
 * <p>Both file locations are hard-coded absolute paths.
 *
 * @throws IOException if an I/O error occurs while reading or writing the files
 */
static void foobar() throws IOException {
  Path source = Paths
      .get("/home/aaa/projects/zemberek-nlp/morphology/src/main/resources/tr/person-names.dict");
  Path target = Paths
      .get(
          "/home/aaa/projects/zemberek-nlp/morphology/src/main/resources/tr/person-names-reduced.dict");

  List<String> lines = Files.readAllLines(source);

  TurkishMorphology morphology = TurkishMorphology.create(
      RootLexicon.builder().addTextDictionaryResources(
          "tr/master-dictionary.dict",
          "tr/non-tdk.dict",
          "tr/proper.dict",
          "tr/proper-from-corpus.dict",
          "tr/abbreviations.dict").build());
  RootLexicon lexicon = morphology.getLexicon();

  List<String> remaining = new ArrayList<>();
  for (String line : lines) {
    // Skip lines that are empty or contain only whitespace.
    if (line.trim().isEmpty()) {
      continue;
    }
    // Collapse runs of spaces into one and strip the ends before parsing the entry.
    String normalized = line.replaceAll("[ ]+", " ").trim();
    DictionaryItem parsed = TurkishDictionaryLoader.loadFromString(normalized);
    if (lexicon.containsItem(parsed)) {
      continue;
    }
    remaining.add(normalized);
  }

  remaining.sort(Turkish.STRING_COMPARATOR_ASC);
  Files.write(target, remaining);
}
/**
 * Loads the {@code "tr_group"} internal language-identifier model group and builds the Turkish
 * morphology from the bundled text dictionaries, including the person-names dictionary.
 *
 * @throws IOException if the language identifier cannot be loaded, or if any other I/O error
 *     occurs during construction
 */
public AmbiguityScriptsBase() throws IOException {
  // The constructor already declares IOException, so a failed model load is propagated
  // instead of being printed and ignored, which would leave identifier unassigned.
  identifier = LanguageIdentifier.fromInternalModelGroup("tr_group");

  this.morphology = TurkishMorphology.create(RootLexicon.builder()
      .addTextDictionaryResources(
          "tr/master-dictionary.dict",
          "tr/non-tdk.dict",
          "tr/proper.dict",
          "tr/proper-from-corpus.dict",
          "tr/abbreviations.dict",
          "tr/person-names.dict"
      ).build());
}
.asCharSource(dataPath.toFile(), Charsets.UTF_8).readLines(new DataSetLoader()); TurkishMorphology morphology = TurkishMorphology.create( RootLexicon.builder().addTextDictionaryResources( "tr/master-dictionary.dict",