/**
 * Returns the largest number of whitespace-separated tokens found on the
 * foreign side of any entry in this table.
 *
 * <p>The result is kept in {@code longestForeignPhrase}; the entries are only
 * scanned while that field is still negative. If the table has no entries the
 * field is left untouched, so it is returned as-is and the scan is attempted
 * again on the next call.
 *
 * @return the cached or freshly computed maximum token count
 */
@Override
public int getLongestForeignPhrase() {
    if (longestForeignPhrase >= 0) {
        // Already computed on an earlier call.
        return longestForeignPhrase;
    }
    for (PhraseTableEntry entry : this) {
        final String[] words = entry.getForeign().asString().split("\\s+");
        longestForeignPhrase = Math.max(longestForeignPhrase, words.length);
    }
    return longestForeignPhrase;
}
/**
 * Computes the fraction of source-label tokens that the source-language model
 * scores with a finite value when each token is presented on its own.
 *
 * @param translation the translation whose source label is examined
 * @return a value between 0 and 1; {@code 1.0} when no model is available for
 *         the source language or when the source label yields no tokens
 */
public double percentUnigramsInLM(Translation translation) {
    final LanguageModel nGramSource = getModel(translation.getSourceLabel().getLanguage());
    if (nGramSource == null) {
        return 1.0;
    }
    final List<String> tokens = getTokens(translation.getSourceLabel());
    if (tokens.isEmpty()) {
        // A label with no tokens (e.g. whitespace only) would otherwise give
        // 0.0 / 0 == NaN; no token is unknown, so report full coverage, the
        // same value used when there is no model.
        return 1.0;
    }
    int unknown = 0;
    for (String token : tokens) {
        // An infinite score marks a token the model does not cover.
        if (Double.isInfinite(nGramSource.score(Arrays.asList(token)))) {
            unknown++;
        }
    }
    return (double) (tokens.size() - unknown) / tokens.size();
}
/**
 * Computes the average number of translation candidates per source token
 * whose third feature score is at least {@code log(minProb)}.
 *
 * @param translation the translation whose source label is tokenized and
 *                    whose language pair selects the translation source
 * @param minProb     threshold; its natural logarithm is compared against
 *                    {@code getFeatures()[2].score} of each candidate
 *                    (presumably a log-probability feature; confirm against
 *                    the phrase table's feature layout)
 * @return qualifying candidates divided by the token count; {@code 0.0} when
 *         no translation source exists for the language pair or when the
 *         source label yields no tokens
 */
public double aveTranslationCount(Translation translation, double minProb) {
    final TranslationSource source = getSource(
            translation.getSourceLabel().getLanguage(),
            translation.getTargetLabel().getLanguage());
    if (source == null) {
        return 0.0;
    }
    final double logThreshold = Math.log(minProb);
    final List<String> tokens = getTokens(translation.getSourceLabel());
    if (tokens.isEmpty()) {
        // Avoid 0.0 / 0 == NaN for labels that produce no tokens.
        return 0.0;
    }
    int transCt = 0;
    for (String token : tokens) {
        final PhraseTable candidates = source.candidates(new ChunkImpl(token));
        for (PhraseTableEntry entry : candidates) {
            if (entry.getFeatures()[2].score >= logThreshold) {
                transCt++;
            }
        }
    }
    return (double) transCt / tokens.size();
}
/**
 * Returns the largest number of whitespace-separated tokens found on the
 * foreign side of any entry in this table.
 *
 * <p>The result is kept in {@code longestForeignPhrase}; the entries are only
 * scanned while that field is still negative. If the table has no entries the
 * field is left untouched, so it is returned as-is and the scan is attempted
 * again on the next call.
 *
 * @return the cached or freshly computed maximum token count
 */
@Override
public int getLongestForeignPhrase() {
    if (longestForeignPhrase >= 0) {
        // Already computed on an earlier call.
        return longestForeignPhrase;
    }
    for (PhraseTableEntry entry : this) {
        final String[] words = entry.getForeign().asString().split("\\s+");
        longestForeignPhrase = Math.max(longestForeignPhrase, words.length);
    }
    return longestForeignPhrase;
}
/**
 * Scores the target label of a translation with the language model of the
 * target language.
 *
 * @param translation the translation whose target label is scored
 * @return the value of {@code lmProb} for the target tokens, or {@code 0}
 *         when no model is available for the target language
 */
public double targetLMProb(Translation translation) {
    final LanguageModel targetModel = getModel(translation.getTargetLabel().getLanguage());
    return targetModel == null
            ? 0
            : lmProb(targetModel, getTokens(translation.getTargetLabel()));
}
/**
 * Returns the tokens of a label.
 *
 * @param label the label to tokenize
 * @return the label's own token list when it is a {@code TokenizedLabel};
 *         otherwise a fixed-size list obtained by splitting its string form
 *         on runs of whitespace
 */
private List<String> getTokens(Label label) {
    if (!(label instanceof TokenizedLabel)) {
        final String[] words = label.asString().split("\\s+");
        return Arrays.asList(words);
    }
    final TokenizedLabel tokenized = (TokenizedLabel) label;
    return tokenized.getTokens();
}
/**
 * Scores the source label of a translation with the language model of the
 * source language.
 *
 * @param translation the translation whose source label is scored
 * @return the value of {@code lmProb} for the source tokens, or {@code 0}
 *         when no model is available for the source language
 */
public double sourceLMProb(Translation translation) {
    final LanguageModel sourceModel = getModel(translation.getSourceLabel().getLanguage());
    return sourceModel == null
            ? 0
            : lmProb(sourceModel, getTokens(translation.getSourceLabel()));
}
/**
 * Returns the tokens of a label.
 *
 * @param label the label to tokenize
 * @return the label's own token list when it is a {@code TokenizedLabel};
 *         otherwise a fixed-size list obtained by splitting its string form
 *         on runs of whitespace
 */
private static List<String> getTokens(Label label) {
    if (!(label instanceof TokenizedLabel)) {
        final String[] words = label.asString().split("\\s+");
        return Arrays.asList(words);
    }
    final TokenizedLabel tokenized = (TokenizedLabel) label;
    return tokenized.getTokens();
}
public double[] percentNGramsInTopBotQuartile(Translation translation, int n) { final LanguageModel nGramSource = getModel(translation.getSourceLabel().getLanguage()); if (nGramSource == null) { return new double[]{0.0, 0.0};
/**
 * Returns the largest number of whitespace-separated tokens found on the
 * foreign side of any entry in this table.
 *
 * <p>The result is kept in {@code longestForeignPhrase}; the entries are only
 * scanned while that field is still negative. If the table has no entries the
 * field is left untouched, so it is returned as-is and the scan is attempted
 * again on the next call.
 *
 * @return the cached or freshly computed maximum token count
 */
@Override
public int getLongestForeignPhrase() {
    if (longestForeignPhrase >= 0) {
        // Already computed on an earlier call.
        return longestForeignPhrase;
    }
    for (PhraseTableEntry entry : this) {
        final String[] words = entry.getForeign().asString().split("\\s+");
        longestForeignPhrase = Math.max(longestForeignPhrase, words.length);
    }
    return longestForeignPhrase;
}
/**
 * Returns the largest number of whitespace-separated tokens found on the
 * foreign side of any entry in this table.
 *
 * <p>The result is kept in {@code longestForeignPhrase}; the entries are only
 * scanned while that field is still negative. If the table has no entries the
 * field is left untouched, so it is returned as-is and the scan is attempted
 * again on the next call.
 *
 * @return the cached or freshly computed maximum token count
 */
@Override
public int getLongestForeignPhrase() {
    if (longestForeignPhrase >= 0) {
        // Already computed on an earlier call.
        return longestForeignPhrase;
    }
    for (PhraseTableEntry entry : this) {
        final String[] words = entry.getForeign().asString().split("\\s+");
        longestForeignPhrase = Math.max(longestForeignPhrase, words.length);
    }
    return longestForeignPhrase;
}
/**
 * Returns the largest number of whitespace-separated tokens found on the
 * foreign side of any entry in this table.
 *
 * <p>The result is kept in {@code longestForeignPhrase}; the entries are only
 * scanned while that field is still negative. If the table has no entries the
 * field is left untouched, so it is returned as-is and the scan is attempted
 * again on the next call.
 *
 * @return the cached or freshly computed maximum token count
 */
@Override
public int getLongestForeignPhrase() {
    if (longestForeignPhrase >= 0) {
        // Already computed on an earlier call.
        return longestForeignPhrase;
    }
    for (PhraseTableEntry entry : this) {
        final String[] words = entry.getForeign().asString().split("\\s+");
        longestForeignPhrase = Math.max(longestForeignPhrase, words.length);
    }
    return longestForeignPhrase;
}
/**
 * Counts the tokens of a label that fully match the {@code puncClass} regex.
 *
 * <p>A {@code TokenizedLabel} contributes its tokens unchanged. Any other
 * label is split at regex word boundaries ({@code \b}) and each piece is
 * trimmed before it is matched.
 *
 * @param label the label to inspect
 * @return the number of matching tokens, widened to {@code double}
 */
private static double countPunctuation(Label label) {
    int matches = 0;
    if (!(label instanceof TokenizedLabel)) {
        final String[] pieces = label.asString().split("\\b");
        for (int i = 0; i < pieces.length; i++) {
            if (pieces[i].trim().matches(puncClass)) {
                matches++;
            }
        }
        return matches;
    }
    final TokenizedLabel tokenized = (TokenizedLabel) label;
    for (String token : tokenized.getTokens()) {
        if (token.matches(puncClass)) {
            matches++;
        }
    }
    return matches;
}
/**
 * Scores a phrase-table entry against the labels registered for an entity.
 *
 * @param entry  the entry whose foreign and translation strings are checked
 * @param entity the entity used as key into {@code srcLabels} and
 *               {@code trgLabels}
 * @return {@code 1} when the entry's foreign string contains one of the
 *         entity's source labels and its translation string contains one of
 *         the entity's target labels; {@code 0} otherwise, including when
 *         the entity has no source labels registered
 */
@Override
public double score(PhraseTableEntry entry, Entity entity) {
    if (!srcLabels.containsKey(entity)) {
        return 0;
    }
    assert (trgLabels.containsKey(entity));
    final String foreignText = entry.getForeign().asString();
    final String translatedText = entry.getTranslation().asString();

    // The source side must contain at least one known source label.
    boolean sourceMatched = false;
    for (String srcCandidate : srcLabels.get(entity)) {
        if (foreignText.contains(srcCandidate)) {
            sourceMatched = true;
            break;
        }
    }
    if (!sourceMatched) {
        return 0;
    }

    // ...and the translation side at least one known target label.
    for (String trgCandidate : trgLabels.get(entity)) {
        if (translatedText.contains(trgCandidate)) {
            return 1;
        }
    }
    return 0;
}
public void setPhraseTable(eu.monnetproject.translation.PhraseTable pt, List<String> featureNames) { ptMap = new HashMap<String, List<PhraseTableEntry>>(); for(PhraseTableEntry pte : pt) { final String key = pte.getForeign().asString(); if(!ptMap.containsKey(key)) { ptMap.put(key, new ArrayList<PhraseTableEntry>()); } ptMap.get(key).add(pte); } ptName = pt.getName(); ptLongestPhrase = pt.getLongestForeignPhrase(); scoreNames = featureNames; //log.info("New lfp " + ptLongestPhrase); }
/**
 * Computes the average number of occurrences per distinct token in the
 * target label of a translation.
 *
 * <p>Tokens come from the label itself when it is a {@code TokenizedLabel},
 * otherwise from splitting its string form on runs of whitespace.
 *
 * @param translation the translation whose target label is examined
 * @return total token count divided by the number of distinct tokens, or
 *         {@code 0.0} when the label yields no tokens
 */
public static double aveOccurencesInTarget(Translation translation) {
    final HashMap<String, Integer> occMap = new HashMap<String, Integer>();
    final Label label = translation.getTargetLabel();
    if (label instanceof TokenizedLabel) {
        for (String token : ((TokenizedLabel) label).getTokens()) {
            incrementOccurrence(occMap, token);
        }
    } else {
        for (String token : label.asString().split("\\s+")) {
            incrementOccurrence(occMap, token);
        }
    }
    if (occMap.isEmpty()) {
        // A whitespace-only label splits into zero tokens; avoid 0.0 / 0 == NaN.
        return 0.0;
    }
    double total = 0.0;
    for (int occurrences : occMap.values()) {
        total += occurrences;
    }
    return total / occMap.size();
}

/** Adds one to the stored count of {@code token}, starting from 1 if unseen. */
private static void incrementOccurrence(HashMap<String, Integer> counts, String token) {
    final Integer seen = counts.get(token);
    counts.put(token, seen == null ? 1 : seen + 1);
}
/**
 * Counts the tokens of the source label of a translation.
 *
 * @param translation the translation whose source label is counted
 * @return the size of the label's token list when it is a
 *         {@code TokenizedLabel}, otherwise the number of pieces obtained by
 *         splitting its string form on runs of whitespace
 */
public static double countTksInSrc(Translation translation) {
    final boolean tokenized = translation.getSourceLabel() instanceof TokenizedLabel;
    return tokenized
            ? ((TokenizedLabel) translation.getSourceLabel()).getTokens().size()
            : translation.getSourceLabel().asString().split("\\s+").length;
}
/**
 * Counts the tokens of the target label of a translation.
 *
 * @param translation the translation whose target label is counted
 * @return the size of the label's token list when it is a
 *         {@code TokenizedLabel}, otherwise the number of pieces obtained by
 *         splitting its string form on runs of whitespace
 */
public static double countTksInTrg(Translation translation) {
    final boolean tokenized = translation.getTargetLabel() instanceof TokenizedLabel;
    return tokenized
            ? ((TokenizedLabel) translation.getTargetLabel()).getTokens().size()
            : translation.getTargetLabel().asString().split("\\s+").length;
}
/**
 * Computes the average character length of the tokens in the source label of
 * a translation.
 *
 * <p>Tokens come from the source label itself when it is a
 * {@code TokenizedLabel}, otherwise from splitting its string form on runs of
 * whitespace.
 *
 * @param translation the translation whose source label is measured
 * @return the mean token length, or {@code 0.0} when the label yields no
 *         tokens
 */
public static double aveSrcTkLen(Translation translation) {
    double length = 0.0;
    int n = 0;
    // The type check must be made on the source label, the object that is
    // cast below; checking the target label risks a ClassCastException or
    // silently ignores the source label's own tokens.
    if (translation.getSourceLabel() instanceof TokenizedLabel) {
        for (String token : ((TokenizedLabel) translation.getSourceLabel()).getTokens()) {
            length += token.length();
            n++;
        }
    } else {
        for (String token : translation.getSourceLabel().asString().split("\\s+")) {
            length += token.length();
            n++;
        }
    }
    if (n == 0) {
        // A whitespace-only label splits into zero tokens; avoid 0.0 / 0 == NaN.
        return 0.0;
    }
    return length / n;
}

/** Presumably the score Moses assigns to words unknown to the language model; confirm against its users. */
public static final double MOSES_LM_UNKNOWN_WORD_SCORE = -100;
/**
 * Scores the translation side of a phrase-table entry against a source-side
 * context using {@code clesa}.
 *
 * <p>The context string is chosen as follows, later rules overriding earlier
 * ones: {@code ontoDoc} when it is non-null; {@code entity.toString()} when
 * {@code nearByTermsAsContext} is set; the entry's foreign string when
 * {@code onlyChunkAsContext} is set; {@code entity.toString()} when
 * {@code sourceLabelAsContext} is set.
 *
 * @param entry  the entry whose translation is the candidate
 * @param entity the entity whose string form may serve as context
 * @return the score against {@code ontoVector} when the chosen context
 *         equals {@code ontoDoc} ignoring case, otherwise the pairwise score
 *         of candidate and context; {@code 0} when no context is available
 */
@Override
public double score(PhraseTableEntry entry, Entity entity) {
    final String candidate = entry.getTranslation().asString();
    String srcString = null;
    if (ontoDoc != null) {
        srcString = ontoDoc;
    }
    if (nearByTermsAsContext) {
        srcString = entity.toString();
    }
    if (onlyChunkAsContext) {
        srcString = entry.getForeign().asString();
    }
    if (sourceLabelAsContext) {
        srcString = entity.toString();
    }
    if (srcString == null) {
        // No ontology document and no context flag set: there is nothing to
        // compare against, so report a neutral score instead of failing with
        // a NullPointerException below.
        return 0;
    }
    if (srcString.equalsIgnoreCase(ontoDoc)) {
        return clesa.scoreAgainstVector(new Pair<String, Language>(candidate, trgLang), ontoVector);
    }
    return clesa.score(new Pair<String, Language>(candidate, trgLang),
            new Pair<String, Language>(srcString, srcLang));
}