/**
 * Set prior information about language probabilities.
 *
 * <p>For every language in {@code langlist}, the matching entry of the given map
 * (if any) is taken as that language's prior weight; languages without an entry
 * (or with a {@code null} value) get weight zero. The weights are then divided by
 * their sum so that they add up to one.
 *
 * <p>The detector's current prior is only replaced once the whole input has been
 * validated; if an exception is thrown the previous prior is left untouched.
 *
 * @param priorMap the priorMap to set, keyed by language name
 * @throws LangDetectException with {@code ErrorCode.InitParamError} if any weight
 *         is negative or if the weights do not sum to a positive value
 */
public void setPriorMap(HashMap<String, Double> priorMap) throws LangDetectException {
    // Build into a local array first so a validation failure cannot leave
    // this.priorMap half-populated and un-normalised.
    final double[] priors = new double[langlist.size()];
    double sump = 0;
    for (int i = 0; i < priors.length; ++i) {
        String lang = langlist.get(i);
        Double p = priorMap.get(lang);
        if (p == null) {
            continue; // no prior given for this language: weight stays 0
        }
        if (p < 0) {
            throw new LangDetectException(ErrorCode.InitParamError, "Prior probability must be non-negative.");
        }
        priors[i] = p;
        sump += p;
    }
    if (sump <= 0) {
        throw new LangDetectException(ErrorCode.InitParamError, "More one of prior probability must be non-zero.");
    }
    for (int i = 0; i < priors.length; ++i) {
        priors[i] /= sump;
    }
    this.priorMap = priors;
}
/**
 * Creates a new {@code Detector} backed by the shared factory instance.
 *
 * @return a freshly constructed detector
 * @throws LangDetectException with {@code ErrorCode.NeedLoadProfileError} when the
 *         factory's language list is empty
 */
private static Detector createDetector() throws LangDetectException {
    if (instance_.langlist.size() == 0) {
        throw new LangDetectException(ErrorCode.NeedLoadProfileError, "need to load profiles");
    }
    return new Detector(instance_);
}
/**
 * Registers one language profile with the shared factory instance.
 *
 * <p>The profile's name is appended to the language list. Every word in the
 * profile's frequency table gets a probability array of length {@code langsize}
 * (created on first sight); for words of one to three characters, slot
 * {@code index} of that array is set to the word's frequency divided by
 * {@code n_words[length - 1]}.
 *
 * @param profile  the profile to add; its {@code name}, {@code freq} and
 *                 {@code n_words} members are read
 * @param index    slot of this language inside each word's probability array
 * @param langsize length used for newly created probability arrays
 * @throws LangDetectException with {@code ErrorCode.DuplicateLangError} if a
 *         profile with the same name has already been added
 */
static /* package scope */ void addProfile(LangProfile profile, int index, int langsize) throws LangDetectException {
    final String language = profile.name;
    if (instance_.langlist.contains(language)) {
        throw new LangDetectException(ErrorCode.DuplicateLangError, "duplicate the same language profile");
    }
    instance_.langlist.add(language);

    for (String ngram : profile.freq.keySet()) {
        // Every word gets an array, even if its length falls outside 1..3.
        if (!instance_.wordLangProbMap.containsKey(ngram)) {
            instance_.wordLangProbMap.put(ngram, new double[langsize]);
        }

        final int ngramLength = ngram.length();
        if (ngramLength < 1 || ngramLength > 3) {
            continue;
        }

        final double probability =
                profile.freq.get(ngram).doubleValue() / profile.n_words[ngramLength - 1];
        instance_.wordLangProbMap.get(ngram)[index] = probability;
    }
}
/**
 * Decodes a language profile from a JSON stream and wraps it in a
 * {@code LanguageProfile}.
 *
 * @param is profile input stream
 * @return the wrapped profile
 * @throws LanguageDetectorException if decoding raises a {@code JSONException}
 *         (wrapped cause carries {@code ErrorCode.FormatError}) or an
 *         {@code IOException} (wrapped cause carries {@code ErrorCode.FileLoadError})
 */
public static LanguageProfile loadFromStream(InputStream is) throws LanguageDetectorException {
    try {
        return new LanguageProfile(JSON.decode(is, LangProfile.class));
    } catch (JSONException formatFailure) {
        final LangDetectException reason =
                new LangDetectException(ErrorCode.FormatError, "profile format error.");
        throw new LanguageDetectorException("Cannot load language profile.", reason);
    } catch (IOException readFailure) {
        final LangDetectException reason =
                new LangDetectException(ErrorCode.FileLoadError, "can't read profile.");
        throw new LanguageDetectorException("Cannot load language profile.", reason);
    }
}
/**
 * Load profiles from a list of JSON strings.
 * This method must be called once before language detection.
 *
 * <p>Each string is decoded into a {@code LangProfile} and passed to
 * {@code addProfile} together with its position in the list and the list size.
 *
 * @param json_profiles JSON-encoded language profiles, one per language; at least
 *                      two are required
 * @throws LangDetectException fewer than two profiles were supplied
 *                              (error code = {@link ErrorCode#NeedLoadProfileError}),
 *                              or profile's format is wrong (error code = {@link ErrorCode#FormatError}),
 *                              or {@code addProfile} rejected a profile
 */
public static void loadProfile(List<String> json_profiles) throws LangDetectException {
    int langsize = json_profiles.size();
    if (langsize < 2) {
        // The guard accepts exactly two profiles, so the message says "at least".
        throw new LangDetectException(ErrorCode.NeedLoadProfileError, "Need at least 2 profiles");
    }

    int index = 0;
    for (String json : json_profiles) {
        try {
            LangProfile profile = JSON.decode(json, LangProfile.class);
            addProfile(profile, index, langsize);
            ++index;
        } catch (JSONException e) {
            throw new LangDetectException(ErrorCode.FormatError, "profile format error");
        }
    }
}
File[] listFiles = profileDirectory.listFiles(); if (listFiles == null) throw new LangDetectException(ErrorCode.NeedLoadProfileError, "Not found profile: " + profileDirectory); ++index; } catch (JSONException e) { throw new LangDetectException(ErrorCode.FormatError, "profile format error in '" + file.getName() + "'"); } catch (IOException e) { throw new LangDetectException(ErrorCode.FileLoadError, "can't open '" + file.getName() + "'"); } finally { try {
throw new LanguageDetectorException( "Cannot initialize profile for language tag: " + language, new LangDetectException( ErrorCode.DuplicateLangError, "duplicate the same language profile"));
throw new LangDetectException(ErrorCode.CantOpenTrainData, "Can't open training database file '" + file.getName() + "'"); } finally { try {
/**
 * Clears the detector factory and loads the language profiles returned by
 * {@code loadProfiles("profiles", "profiles.cfg")}.
 *
 * @throws LangDetectException if anything fails while loading; the message is
 *         prefixed with "Error in Initialization: " and the original exception is
 *         attached as the cause
 */
public LanguageIdentifier() throws LangDetectException {
    DetectorFactory.clear();
    try {
        DetectorFactory.loadProfile(loadProfiles("profiles", "profiles.cfg"));
    } catch (Exception e) {
        LangDetectException failure =
                new LangDetectException(null, "Error in Initialization: " + e.getMessage());
        // Keep the original stack trace reachable instead of only its message.
        // NOTE(review): relies on the two-argument constructor not setting a cause itself — confirm.
        failure.initCause(e);
        throw failure;
    }
}

/**
/**
 * Loads language profiles from a referenced ZIP.<p>
 *
 * The resource is read through the given CMS object, its contents are wrapped in
 * a {@code ZipInputStream}, and that stream is handed to the stream-based
 * {@code loadProfile} overload. Nothing is loaded if the read yields {@code null}.
 *
 * @param cms the cms object
 * @param path the path to the language profiles ZIP in the VFS
 *
 * @throws LangDetectException if something goes wrong
 */
public static void loadProfile(CmsObject cms, String path) throws LangDetectException {
    final CmsFile archive;
    try {
        archive = cms.readFile(path);
    } catch (CmsException readFailure) {
        throw new LangDetectException(ErrorCode.FileLoadError, "can't read resource from '" + path + "'");
    }

    if (archive == null) {
        return;
    }

    ZipInputStream zipInput = new ZipInputStream(new ByteArrayInputStream(archive.getContents()));
    loadProfile(zipInput);
}
throw new LangDetectException(ErrorCode.TrainDataFormatError, "Training database file '" + file.getName() + "' is an invalid XML."); } finally { try { throw new LangDetectException(ErrorCode.CantOpenTrainData, "Can't open training database file '" + file.getName() + "'"); } finally { try {
profiles.add(profile); } catch (JSONException e) { throw new LangDetectException(ErrorCode.FormatError, "profile format error in '" + entry.getName() + "'"); } catch (IOException e) { throw new LangDetectException(ErrorCode.FileLoadError, "can't open '" + entry.getName() + "'"); } finally { try {
/**
 * Runs the randomised detection over the current text and stores the averaged
 * per-language probabilities in {@code langprob}.
 *
 * <p>After cleaning the text and extracting its n-grams, {@code n_trial} trials
 * are run. Each trial starts from {@code initProbability()}, perturbs the
 * smoothing parameter by a Gaussian sample scaled with {@code ALPHA_WIDTH}, and
 * repeatedly feeds a randomly picked n-gram to {@code updateLangProb}. On every
 * fifth iteration the probabilities are normalised; the trial ends when the value
 * returned by {@code normalizeProb} exceeds {@code CONV_THRESHOLD} or the
 * iteration counter has reached {@code ITERATION_LIMIT}.
 *
 * @throws LangDetectException with {@code ErrorCode.CantDetectError} when no
 *         n-grams could be extracted from the text
 */
private void detectBlock() throws LangDetectException {
    cleaningText();
    ArrayList<String> ngrams = extractNGrams();
    if (ngrams.size() == 0) {
        throw new LangDetectException(ErrorCode.CantDetectError, "no features in text");
    }

    langprob = new double[langlist.size()];

    Random rand = new Random();
    if (seed != null) {
        rand.setSeed(seed);
    }

    for (int trial = 0; trial < n_trial; ++trial) {
        double[] prob = initProbability();
        double trialAlpha = this.alpha + rand.nextGaussian() * ALPHA_WIDTH;

        int iteration = 0;
        while (true) {
            int pick = rand.nextInt(ngrams.size());
            updateLangProb(prob, ngrams.get(pick), trialAlpha);
            if (iteration % 5 == 0) {
                // Convergence is only checked on every fifth step.
                if (normalizeProb(prob) > CONV_THRESHOLD || iteration >= ITERATION_LIMIT) {
                    break;
                }
                if (verbose) {
                    System.out.println("> " + sortProbability(prob));
                }
            }
            ++iteration;
        }

        // Accumulate this trial's share of the average.
        for (int lang = 0; lang < langprob.length; ++lang) {
            langprob[lang] += prob[lang] / n_trial;
        }
        if (verbose) {
            System.out.println("==> " + sortProbability(prob));
        }
    }
}