/**
 * Reads every record of a sequence file and returns the records as pairs of strings.
 *
 * <p>Key and value holders are instantiated reflectively from the classes reported by
 * the reader and are cast to {@code Text} when converted to strings, so a file whose
 * keys or values are of another type fails with a {@link ClassCastException}.
 *
 * @param inputFileUri location passed to {@code getSequenceFileReader}
 * @return the (key, value) pairs in the order the reader produced them; never {@code null}
 * @throws IllegalStateException if opening or reading the file raises an
 *         {@link IOException}; the original exception is preserved as the cause
 */
public static List<Pair<String, String>> readTextPairs(String inputFileUri) {
    List<Pair<String, String>> texts = Lists.newArrayList();
    SequenceFile.Reader reader = null;
    try {
        Configuration conf = new Configuration();
        reader = getSequenceFileReader(inputFileUri, conf);
        Writable key = (Writable) ReflectionUtils.newInstance(reader.getKeyClass(), conf);
        Writable value = (Writable) ReflectionUtils.newInstance(reader.getValueClass(), conf);
        // The same key/value instances are refilled on each call to next(), so the
        // contents are copied out as Strings before the next iteration.
        while (reader.next(key, value)) {
            texts.add(new Pair<>(((Text) key).toString(), ((Text) value).toString()));
        }
    } catch (IOException e) {
        // Surface the failure: returning the list here would hand the caller an empty
        // or truncated result that is indistinguishable from a genuinely short file.
        throw new IllegalStateException(
                "Failed to read text pairs from sequence file: " + inputFileUri, e);
    } finally {
        // closeStream is invoked even when the reader was never opened (reader == null).
        IOUtils.closeStream(reader);
    }
    return texts;
}
/**
 * Votes on whether two documents are equal by comparing their surname lists.
 *
 * <p>Abstains when either document yields no surnames. Otherwise the result is
 * {@code base + weight * allAuthorsMatching(...)}, capped at 1. When both extraction
 * results carry a {@code true} flag, the first surnames are compared: a similarity
 * above 0.5 gives base 0.6 and weight 0.4, anything else gives base 0 and weight 0.9.
 * When the flag is not set on both sides, base is 0 and weight is 0.95.
 *
 * @param doc1 first document
 * @param doc2 second document
 * @return {@code ABSTAIN} if either surname array is empty, {@code NOT_EQUALS} if the
 *         computed value is not positive, otherwise a {@code PROBABILITY} vote
 */
@Override
public Vote vote(DocumentProtos.DocumentMetadata doc1, DocumentProtos.DocumentMetadata doc2) {
    final Pair<String[], Boolean> left = extractSurnames(doc1);
    final Pair<String[], Boolean> right = extractSurnames(doc2);
    final String[] leftNames = left.getX();
    final String[] rightNames = right.getX();

    // Nothing to compare unless both documents provide at least one surname.
    if (leftNames.length == 0 || rightNames.length == 0) {
        return new Vote(Vote.VoteStatus.ABSTAIN);
    }

    float base = 0.0f;
    float weight = 0.95f;
    if (left.getY() && right.getY()) {
        // The first-author comparison is only used when both flags are set.
        final boolean firstAuthorsAlike =
                getSimilarityCalculator().calculateSimilarity(leftNames[0], rightNames[0]) > 0.5f;
        if (firstAuthorsAlike) {
            base = 0.6f;
            weight = 0.4f;
        } else {
            weight = 0.9f;
        }
    }

    // Cap the combined score at 1.
    final float score = Math.min(base + weight * allAuthorsMatching(leftNames, rightNames), 1.0f);
    return score <= 0.0f
            ? new Vote(Vote.VoteStatus.NOT_EQUALS)
            : new Vote(Vote.VoteStatus.PROBABILITY, score);
}
/**
 * Builds a map describing one document: its key, cleaned title, keywords, cleaned
 * abstract and categories.
 *
 * <p>Title and abstract are passed through the text filter selected by the
 * {@code action} field: {@code translateNonAlphaNumeric} for {@code TRANSLATE},
 * {@code removeAllKeyPunctations} for {@code REMOVE_KEYCHARACTERS}, and
 * {@code removeAllNonAlphaNumeric} for any other value.
 *
 * @param dm  document metadata to read from
 * @param lim threshold that the size of the second element of the
 *            {@code extractLangKeywords} result must exceed
 * @return a map with the entries {@code key}, {@code title}, {@code keywords},
 *         {@code abstract} and {@code categories}; {@code null} when no title is
 *         extracted or when the size does not exceed {@code lim}
 */
protected Map generateConcreteLanguageMap(DocumentMetadata dm, int lim) {
    String title = extractLangTitle(dm);
    if (title == null) {
        return null;
    }
    String summary = extractLangAbstract(dm);
    final Pair<String, DataBag> keywordsAndCategories = extractLangKeywords(dm);

    // Both text fields always go through the same filter.
    if (action == Action.TRANSLATE) {
        title = translateNonAlphaNumeric(title);
        summary = translateNonAlphaNumeric(summary);
    } else if (action == Action.REMOVE_KEYCHARACTERS) {
        title = removeAllKeyPunctations(title);
        summary = removeAllKeyPunctations(summary);
    } else {
        title = removeAllNonAlphaNumeric(title);
        summary = removeAllNonAlphaNumeric(summary);
    }

    // The size of the bag must be strictly greater than lim to produce a result.
    if (keywordsAndCategories.getY().size() <= lim) {
        return null;
    }

    final Map<String, Object> result = new HashMap<>();
    result.put("key", dm.getKey());
    result.put("title", title);
    result.put("keywords", keywordsAndCategories.getX());
    result.put("abstract", summary);
    result.put("categories", keywordsAndCategories.getY());
    return result;
}
return new Pair<String, DataBag>(Joiner.on(" ").join(kws), db);
orderNb++; return new Pair<String[], Boolean>(positionsCorrect ? resultByPositionNb : resultByOrder, positionsCorrect);