/**
 * Creates a profile for the given language.
 *
 * @param name language name; it is stored by calling {@code setName(name)}
 */
public LangProfile(String name) {
    setName(name);
}
LangProfile langProfile = new LangProfile(); String[] keyValue = entry.split(":"); String label = keyValue[0].trim().replace("\"", ""); langProfile.getFreq().put(label, Integer.valueOf(keyValue[1])); if (m.find()) { String[] nWords = m.group(1).split(","); langProfile.setNWords(new int[nWords.length]); for (int i = 0; i < nWords.length; i++) { langProfile.getNWords()[i] = Integer.parseInt(nWords[i]); langProfile.setName(m.group(1));
writer.write("{\"freq\":{"); boolean first = true; for (Map.Entry<String, Integer> entry : langProfile.getFreq().entrySet()) { if (!first) { writer.write(','); for (int nWord : langProfile.getNWords()) { if (!first) { writer.write(','); writer.write(langProfile.getName()); writer.write("\"}"); writer.flush();
/**
 * Converts a {@code LangProfile} into a {@code LanguageProfile}.
 *
 * <p>The profile's name is parsed into an {@code LdLocale}; every entry of the profile's
 * frequency map is then added to a {@code LanguageProfileBuilder} as a gram with its count.</p>
 *
 * @param langProfile the profile to convert
 * @return the result of {@code LanguageProfileBuilder.build()}
 * @throws RuntimeException if resolving the locale from the profile name fails for any reason
 */
public static LanguageProfile convert(LangProfile langProfile) {
    LanguageProfileBuilder target = new LanguageProfileBuilder(resolveLocale(langProfile));
    for (Map.Entry<String, Integer> gram : langProfile.getFreq().entrySet()) {
        target.addGram(gram.getKey(), gram.getValue());
    }
    return target.build();
}

/** Parses the profile's name into a locale, wrapping any failure in a RuntimeException. */
private static LdLocale resolveLocale(LangProfile langProfile) {
    try {
        return LdLocale.fromString(langProfile.getName());
    } catch (Exception cause) {
        throw new RuntimeException("Profile file name logic was changed in v0.5, please update your custom profiles!", cause);
    }
}
/**
 * Builds a language profile from the lines of a text file.
 *
 * <p>The file is decoded as UTF-8. When its name ends with {@code .gz} the content is
 * gunzipped while reading. Each line is padded with one space on both sides, turned into a
 * text object by {@code textObjectFactory}, and handed to {@code Util.addCharSequence}.</p>
 *
 * @param lang target language name.
 * @param textFile input text file.
 * @return the profile that received the content of every line
 * @throws RuntimeException if an {@link IOException} occurs while opening or reading the file
 */
public static LangProfile generate(String lang, File textFile) {
    LangProfile result = new LangProfile(lang);
    InputStream stream = null;
    try {
        stream = new BufferedInputStream(new FileInputStream(textFile));
        boolean gzipped = textFile.getName().endsWith(".gz");
        if (gzipped) {
            stream = new GZIPInputStream(stream);
        }
        BufferedReader lines = new BufferedReader(new InputStreamReader(stream, Charset.forName("UTF-8")));
        for (String row = lines.readLine(); row != null; row = lines.readLine()) {
            TextObject padded = textObjectFactory.forText(" " + row + " ");
            Util.addCharSequence(result, padded);
        }
    } catch (IOException cause) {
        throw new RuntimeException("Can't open training database file '" + textFile.getName() + "'", cause);
    } finally {
        // Closing the outermost stream is enough; failures while closing are ignored.
        IOUtils.closeQuietly(stream);
    }
    return result;
}
}
public static void addCharSequence(LangProfile langProfile, CharSequence text) { //TODO replace with new code. // List<String> old = OldNgramExtractor.extractNGrams(text, null); // List<String> nuu = ngramExtractor.extractGrams(text); // // Set<String> oldSet = new HashSet<>(old); // Set<String> nuuSet = new HashSet<>(nuu); // // ArrayList<String> justNuu = new ArrayList<>(nuu); // justNuu.removeAll(old); // // ArrayList<String> justOld = new ArrayList<>(old); // justOld.removeAll(nuu); // // System.out.println(text); // for (String s : ngramExtractor.extractGrams(text)) { // langProfile.add(s); // } for (String s : OldNgramExtractor.extractNGrams(text, null)) { langProfile.add(s); } }
/**
 * Generates a language profile from a Wikipedia abstract file and writes it to a file.
 *
 * <pre>
 * usage: --genprofile [directory] [language name]
 * </pre>
 *
 * <p>The first argument is the directory handed to {@code searchFile} together with the
 * pattern {@code <lang>wiki-.*-abstract\.xml.*}. The second argument is the language name,
 * which is also used as the path of the output file. If no file is found, a message is
 * printed to standard error and nothing is written.</p>
 *
 * <p>The profile is loaded and pruned <em>before</em> the output file is opened, so a failure
 * while loading does not create or truncate the output file.</p>
 */
public void generateProfile() {
    File directory = new File(arglist.get(0));
    String lang = arglist.get(1);
    File file = searchFile(directory, lang + "wiki-.*-abstract\\.xml.*");
    if (file == null) {
        System.err.println("Not Found text file : lang = " + lang);
        return;
    }

    try {
        LangProfile profile = GenProfile.load(lang, file);
        profile.omitLessFreq();
        // Open the output only once there is something to write: FileOutputStream
        // creates/truncates the target file as soon as it is constructed.
        try (FileOutputStream outputStream = new FileOutputStream(new File(lang))) {
            new LangProfileWriter().write(profile, outputStream);
        }
    } catch (IOException e) {
        e.printStackTrace();
    }
}
LangProfile langProfile = new LangProfile(); String[] keyValue = entry.split(":"); String label = keyValue[0].trim().replace("\"", ""); langProfile.getFreq().put(label, Integer.valueOf(keyValue[1])); if (m.find()) { String[] nWords = m.group(1).split(","); langProfile.setNWords(new int[nWords.length]); for (int i = 0; i < nWords.length; i++) { langProfile.getNWords()[i] = Integer.parseInt(nWords[i]); langProfile.setName(m.group(1));
writer.write("{\"freq\":{"); boolean first = true; for (Map.Entry<String, Integer> entry : langProfile.getFreq().entrySet()) { if (!first) { writer.write(','); for (int nWord : langProfile.getNWords()) { if (!first) { writer.write(','); writer.write(langProfile.getName()); writer.write("\"}"); writer.flush();
/**
 * Converts a {@code LangProfile} into a {@code LanguageProfile}.
 *
 * <p>The profile's name is parsed into an {@code LdLocale}; every entry of the profile's
 * frequency map is then added to a {@code LanguageProfileBuilder} as a gram with its count.</p>
 *
 * @param langProfile the profile to convert
 * @return the result of {@code LanguageProfileBuilder.build()}
 * @throws RuntimeException if resolving the locale from the profile name fails for any reason
 */
public static LanguageProfile convert(LangProfile langProfile) {
    LanguageProfileBuilder target = new LanguageProfileBuilder(resolveLocale(langProfile));
    for (Map.Entry<String, Integer> gram : langProfile.getFreq().entrySet()) {
        target.addGram(gram.getKey(), gram.getValue());
    }
    return target.build();
}

/** Parses the profile's name into a locale, wrapping any failure in a RuntimeException. */
private static LdLocale resolveLocale(LangProfile langProfile) {
    try {
        return LdLocale.fromString(langProfile.getName());
    } catch (Exception cause) {
        throw new RuntimeException("Profile file name logic was changed in v0.5, please update your custom profiles!", cause);
    }
}
/**
 * Builds a language profile from the lines of a text file.
 *
 * <p>The file is decoded as UTF-8. When its name ends with {@code .gz} the content is
 * gunzipped while reading. Each line is padded with one space on both sides, turned into a
 * text object by {@code textObjectFactory}, and handed to {@code Util.addCharSequence}.</p>
 *
 * @param lang target language name.
 * @param textFile input text file.
 * @return the profile that received the content of every line
 * @throws RuntimeException if an {@link IOException} occurs while opening or reading the file
 */
public static LangProfile generate(String lang, File textFile) {
    LangProfile result = new LangProfile(lang);
    InputStream stream = null;
    try {
        stream = new BufferedInputStream(new FileInputStream(textFile));
        boolean gzipped = textFile.getName().endsWith(".gz");
        if (gzipped) {
            stream = new GZIPInputStream(stream);
        }
        BufferedReader lines = new BufferedReader(new InputStreamReader(stream, Charset.forName("UTF-8")));
        for (String row = lines.readLine(); row != null; row = lines.readLine()) {
            TextObject padded = textObjectFactory.forText(" " + row + " ");
            Util.addCharSequence(result, padded);
        }
    } catch (IOException cause) {
        throw new RuntimeException("Can't open training database file '" + textFile.getName() + "'", cause);
    } finally {
        // Closing the outermost stream is enough; failures while closing are ignored.
        IOUtils.closeQuietly(stream);
    }
    return result;
}
}
public static void addCharSequence(LangProfile langProfile, CharSequence text) { //TODO replace with new code. // List<String> old = OldNgramExtractor.extractNGrams(text, null); // List<String> nuu = ngramExtractor.extractGrams(text); // // Set<String> oldSet = new HashSet<>(old); // Set<String> nuuSet = new HashSet<>(nuu); // // ArrayList<String> justNuu = new ArrayList<>(nuu); // justNuu.removeAll(old); // // ArrayList<String> justOld = new ArrayList<>(old); // justOld.removeAll(nuu); // // System.out.println(text); // for (String s : ngramExtractor.extractGrams(text)) { // langProfile.add(s); // } for (String s : OldNgramExtractor.extractNGrams(text, null)) { langProfile.add(s); } }
/**
 * Generates a language profile from a Wikipedia abstract file and writes it to a file.
 *
 * <pre>
 * usage: --genprofile [directory] [language name]
 * </pre>
 *
 * <p>The first argument is the directory handed to {@code searchFile} together with the
 * pattern {@code <lang>wiki-.*-abstract\.xml.*}. The second argument is the language name,
 * which is also used as the path of the output file. If no file is found, a message is
 * printed to standard error and nothing is written.</p>
 *
 * <p>The profile is loaded and pruned <em>before</em> the output file is opened, so a failure
 * while loading does not create or truncate the output file.</p>
 */
public void generateProfile() {
    File directory = new File(arglist.get(0));
    String lang = arglist.get(1);
    File file = searchFile(directory, lang + "wiki-.*-abstract\\.xml.*");
    if (file == null) {
        System.err.println("Not Found text file : lang = " + lang);
        return;
    }

    try {
        LangProfile profile = GenProfile.load(lang, file);
        profile.omitLessFreq();
        // Open the output only once there is something to write: FileOutputStream
        // creates/truncates the target file as soon as it is constructed.
        try (FileOutputStream outputStream = new FileOutputStream(new File(lang))) {
            new LangProfileWriter().write(profile, outputStream);
        }
    } catch (IOException e) {
        e.printStackTrace();
    }
}
// Create the profile through the LangProfile(String) constructor, passing lang as its argument.
LangProfile profile = new LangProfile(lang);
/**
 * Creates a profile for the given language.
 *
 * @param name language name; it is stored by calling {@code setName(name)}
 */
public LangProfile(String name) {
    setName(name);
}
// Create the profile through the LangProfile(String) constructor, passing lang as its argument.
LangProfile profile = new LangProfile(lang);