/**
 * Returns the tokens of the given label.
 *
 * <p>A {@code TokenizedLabel} contributes the result of its own
 * {@code getTokens()}; any other label has its string form split on runs of
 * whitespace ({@code \s+}).
 *
 * @param label the label to tokenize
 * @return the label's tokens
 */
private static List<String> getTokens(Label label) {
    if (!(label instanceof TokenizedLabel)) {
        // Plain label: fall back to whitespace splitting of its string form.
        final String[] pieces = label.asString().split("\\s+");
        return Arrays.asList(pieces);
    }
    final TokenizedLabel tokenized = (TokenizedLabel) label;
    return tokenized.getTokens();
}
/**
 * Produces the token list for a label.
 *
 * <p>If the label is a {@code TokenizedLabel}, its own token list is used.
 * Otherwise the label's string form is split on whitespace runs
 * ({@code \s+}) and the pieces are wrapped in a list.
 *
 * @param label the label whose tokens are wanted
 * @return the tokens of {@code label}
 */
private List<String> getTokens(Label label) {
    final List<String> tokens;
    if (label instanceof TokenizedLabel) {
        final TokenizedLabel tokenized = (TokenizedLabel) label;
        tokens = tokenized.getTokens();
    } else {
        final String text = label.asString();
        tokens = Arrays.asList(text.split("\\s+"));
    }
    return tokens;
}
/**
 * Counts the tokens of a label that fully match the regular expression
 * {@code puncClass}.
 *
 * <p>For a {@code TokenizedLabel} each of its tokens is tested as-is. For any
 * other label the string form is split at regex word boundaries
 * ({@code \b}) and each piece is trimmed before being tested.
 *
 * @param label the label to inspect
 * @return the number of matching tokens (an integer count, returned as a double)
 */
private static double countPunctuation(Label label) {
    // Compile the pattern once per call; String.matches(regex) would recompile
    // the same expression for every single token. Matching semantics are identical.
    final java.util.regex.Pattern punctuation = java.util.regex.Pattern.compile(puncClass);

    int count = 0;
    if (label instanceof TokenizedLabel) {
        for (String token : ((TokenizedLabel) label).getTokens()) {
            if (punctuation.matcher(token).matches()) {
                count++;
            }
        }
    } else {
        for (String piece : label.asString().split("\\b")) {
            // Pieces produced by a \b split may carry surrounding whitespace.
            if (punctuation.matcher(piece.trim()).matches()) {
                count++;
            }
        }
    }
    return count;
}
/**
 * Computes the average number of occurrences per distinct token in the
 * translation's target label, i.e. (total tokens) / (distinct tokens).
 *
 * <p>Tokens come from {@code TokenizedLabel.getTokens()} when the target label
 * is a {@code TokenizedLabel}; otherwise the label's string form is split on
 * whitespace runs ({@code \s+}).
 *
 * @param translation the translation whose target label is analysed
 * @return the average occurrence count per distinct token, or {@code 0.0}
 *         when the target label yields no tokens at all
 */
public static double aveOccurencesInTarget(Translation translation) {
    final Label label = translation.getTargetLabel();

    final List<String> tokens;
    if (label instanceof TokenizedLabel) {
        tokens = ((TokenizedLabel) label).getTokens();
    } else {
        tokens = Arrays.asList(label.asString().split("\\s+"));
    }

    // Tally each distinct token with a single map lookup per token.
    final HashMap<String, Integer> occMap = new HashMap<String, Integer>();
    for (String token : tokens) {
        final Integer seen = occMap.get(token);
        occMap.put(token, seen == null ? 1 : seen + 1);
    }

    // Without this guard an empty token sequence would evaluate 0.0 / 0 and return NaN.
    if (occMap.isEmpty()) {
        return 0.0;
    }

    // The per-token counts sum to the total number of tokens.
    return (double) tokens.size() / occMap.size();
}
/**
 * Counts the tokens in the translation's source label.
 *
 * <p>A {@code TokenizedLabel} reports the size of its token list; any other
 * label is split on whitespace runs ({@code \s+}) and the number of pieces is
 * returned.
 *
 * @param translation the translation whose source label is measured
 * @return the token count, as a double
 */
public static double countTksInSrc(Translation translation) {
    final Label source = translation.getSourceLabel();
    final int tokenCount;
    if (source instanceof TokenizedLabel) {
        tokenCount = ((TokenizedLabel) source).getTokens().size();
    } else {
        final String[] pieces = source.asString().split("\\s+");
        tokenCount = pieces.length;
    }
    return tokenCount;
}
/**
 * Counts the tokens in the translation's target label.
 *
 * <p>For a {@code TokenizedLabel} this is the size of its token list; for any
 * other label it is the number of pieces obtained by splitting the string
 * form on whitespace runs ({@code \s+}).
 *
 * @param translation the translation whose target label is measured
 * @return the token count, as a double
 */
public static double countTksInTrg(Translation translation) {
    final Label target = translation.getTargetLabel();
    return (target instanceof TokenizedLabel)
            ? ((TokenizedLabel) target).getTokens().size()
            : target.asString().split("\\s+").length;
}
/**
 * Computes the average token length (in characters) of the translation's
 * source label.
 *
 * <p>Tokens come from {@code TokenizedLabel.getTokens()} when the source label
 * is a {@code TokenizedLabel}; otherwise the label's string form is split on
 * whitespace runs ({@code \s+}).
 *
 * @param translation the translation whose source label is measured
 * @return the mean token length, or {@code 0.0} when the source label yields
 *         no tokens
 */
public static double aveSrcTkLen(Translation translation) {
    final Label source = translation.getSourceLabel();
    double length = 0.0;
    int n = 0;
    // The type check must be on the SOURCE label, since that is the label being
    // cast; checking the target label here could throw ClassCastException when
    // only the target is tokenized.
    if (source instanceof TokenizedLabel) {
        for (String token : ((TokenizedLabel) source).getTokens()) {
            length += token.length();
            n++;
        }
    } else {
        for (String token : source.asString().split("\\s+")) {
            length += token.length();
            n++;
        }
    }
    // Avoid 0.0 / 0 (NaN) when there are no tokens.
    return n == 0 ? 0.0 : length / n;
}

// NOTE(review): presumably the score used for words unknown to the Moses
// language model -- confirm against the code that reads this constant.
public static final double MOSES_LM_UNKNOWN_WORD_SCORE = -100;