/**
 * Writes each cluster of duplicate documents to the reducer context.
 * The output key is the reducer key suffixed with the cluster id
 * ("{key}_{clusterId}"); one record is emitted per document in the cluster,
 * with the document key as the value.
 *
 * @param sameWorksMap cluster id -> set of metadata of documents considered the same work
 * @param key          reducer input key used as the prefix of every emitted key
 * @param context      reducer context records are written to
 */
private void saveDuplicatesToContext(Map<Integer, Set<DocumentProtos.DocumentMetadata>> sameWorksMap, Text key, Reducer<Text, BytesWritable, Text, Text>.Context context) throws IOException, InterruptedException {
    String keyPrefix = key.toString();
    for (Map.Entry<Integer, Set<DocumentProtos.DocumentMetadata>> cluster : sameWorksMap.entrySet()) {
        String clusterKey = keyPrefix + "_" + cluster.getKey();
        for (DocumentProtos.DocumentMetadata document : cluster.getValue()) {
            context.write(new Text(clusterKey), new Text(document.getKey()));
        }
    }
}
Bytes.toBytes(HBaseConstant.FAMILY_METADATA_QUALIFIER_PROTO))); key = new Text(dm.getKey());
String key = dm.getKey();
String key = dm.getKey();
String key = dm.getKey(); DataBag db = new DefaultDataBag(); int bagsize = 0;
String key = dm.getKey(); boolean hasCateg = false; if (dm.getBasicMetadata().getClassifCodeCount() > 0) {
if (db.size() > 0) { Map<String, Object> map = new HashMap<String, Object>(); map.put("key", metadata.getKey()); map.put("title", titles); map.put("keywords", getConcatenated(metadata.getKeywordsList()));
/**
 * Pig UDF: deserializes a {@code DocumentMetadata} protobuf from the first
 * tuple field and returns a map with the fields "key", "title", "keywords",
 * "abstract" and "categories". All title (resp. abstract) language variants
 * are concatenated with a single space.
 *
 * @param input tuple whose field 0 is the serialized DocumentMetadata
 * @return map of extracted document fields
 * @throws IOException wrapping any parsing/processing failure
 */
@Override
public Map exec(Tuple input) throws IOException {
    try {
        DataByteArray protoMetadata = (DataByteArray) input.get(0);
        DocumentMetadata metadata = DocumentMetadata.parseFrom(protoMetadata.get());

        // Concatenate every language variant of the title.
        List<String> titleList = new ArrayList<String>();
        for (TextWithLanguage title : metadata.getBasicMetadata().getTitleList()) {
            titleList.add(title.getText());
        }
        String titles = Joiner.on(" ").join(titleList);

        // BUGFIX: the original iterated getBasicMetadata().getTitleList() here
        // (copy-paste), so "abstract" silently duplicated the titles. Use the
        // document abstract list instead (same accessor used by
        // addDocumentMetatdataFields elsewhere in this codebase).
        List<String> abstractsList = new ArrayList<String>();
        for (TextWithLanguage documentAbstract : metadata.getDocumentAbstractList()) {
            abstractsList.add(documentAbstract.getText());
        }
        String abstracts = Joiner.on(" ").join(abstractsList);

        Map<String, Object> map = new HashMap<String, Object>();
        map.put("key", metadata.getKey());
        map.put("title", titles);
        map.put("keywords", getConcatenated(metadata.getKeywordsList()));
        map.put("abstract", abstracts);
        map.put("categories", getCategories(metadata.getBasicMetadata().getClassifCodeList()));
        return map;
    } catch (Exception e) {
        logger.error("Error in processing input row:", e);
        throw new IOException("Caught exception processing input row:\n" + StackTraceExtractor.getStackTrace(e));
    }
}
map.put("key", metadata.getKey()); map.put("title", titles); map.put("keywords", getConcatenated(metadata.getKeywordsList()));
String key = dm.getKey(); DataBag db = new DefaultDataBag(); int bagsize = 0;
/**
 * Pig UDF: returns a (key, bag) tuple where the bag contains one tuple of
 * code values for every classification code whose source is MSC.
 * Field 1 of the input tuple holds the serialized DocumentMetadata protobuf.
 *
 * @param input UDF input tuple; may be null/empty, in which case null is returned
 * @return tuple of (document key, bag of MSC code value tuples), or null
 * @throws IOException wrapping any parsing/processing failure
 */
@Override
public Tuple exec(Tuple input) throws IOException {
    if (input == null || input.size() == 0) {
        return null;
    }
    try {
        DataByteArray serialized = (DataByteArray) input.get(1);
        DocumentMetadata metadata = DocumentMetadata.parseFrom(serialized.get());
        DataBag mscCodes = new DefaultDataBag();
        for (ClassifCode code : metadata.getBasicMetadata().getClassifCodeList()) {
            // Keep only codes coming from the MSC classification source.
            if (ProtoConstants.documentClassifCodeMsc.equals(code.getSource())) {
                mscCodes.add(TupleFactory.getInstance().newTuple(code.getValueList()));
            }
        }
        Object[] fields = new Object[]{metadata.getKey(), mscCodes};
        return TupleFactory.getInstance().newTuple(Arrays.asList(fields));
    } catch (Exception e) {
        logger.error("Error in processing input row:", e);
        throw new IOException("Caught exception processing input row:\n" + StackTraceExtractor.getStackTrace(e));
    }
}
}
/**
 * Builds a map of (key, title, keywords, abstract, categories) for a document
 * in the concrete language handled by this instance, after normalizing the
 * title and abstract text according to the configured {@code action}.
 * Returns null when no title in the target language exists, or when the
 * extracted category bag does not exceed {@code lim} entries.
 *
 * @param dm  document metadata to extract fields from
 * @param lim minimum (exclusive) number of categories required to emit a map
 * @return populated field map, or null when the document is skipped
 */
protected Map generateConcreteLanguageMap(DocumentMetadata dm, int lim) {
    // A title in the target language is mandatory; skip the document otherwise.
    String docTitle = extractLangTitle(dm);
    if (docTitle == null) {
        return null;
    }
    String docAbstract = extractLangAbstract(dm);
    Pair<String, DataBag> keywordsAndCategories = extractLangKeywords(dm);

    // Normalize title/abstract text per the configured action
    // (default: strip all non-alphanumeric characters).
    if (Action.TRANSLATE == action) {
        docTitle = translateNonAlphaNumeric(docTitle);
        docAbstract = translateNonAlphaNumeric(docAbstract);
    } else if (Action.REMOVE_KEYCHARACTERS == action) {
        docTitle = removeAllKeyPunctations(docTitle);
        docAbstract = removeAllKeyPunctations(docAbstract);
    } else {
        docTitle = removeAllNonAlphaNumeric(docTitle);
        docAbstract = removeAllNonAlphaNumeric(docAbstract);
    }

    // Emit only documents with more than `lim` categories.
    if (keywordsAndCategories.getY().size() <= lim) {
        return null;
    }
    Map<String, Object> map = new HashMap<String, Object>();
    map.put("key", dm.getKey());
    map.put("title", docTitle);
    map.put("keywords", keywordsAndCategories.getX());
    map.put("abstract", docAbstract);
    map.put("categories", keywordsAndCategories.getY());
    return map;
}
/**
 * Copies document metadata fields (key, titles, abstracts, keywords,
 * contributors) into the given output tuple at the positions defined by
 * {@code fieldNumberMap}.
 * NOTE(review): "Metatdata" in the method name is a typo, kept as-is for
 * caller compatibility.
 *
 * @param metadata source document metadata
 * @param output   tuple to populate (also returned)
 * @return the populated output tuple
 * @throws ExecException on tuple field access errors
 */
private Tuple addDocumentMetatdataFields(DocumentMetadata metadata, Tuple output) throws ExecException {
    output.set(fieldNumberMap.get(C.KEY), metadata.getKey());
    appendToOutput(output, C.TITLE, metadata.getBasicMetadata().getTitleList());
    appendToOutput(output, C.ABSTRACT_TEXT, metadata.getDocumentAbstractList());

    // Flatten all keyword lists into one bag, stripping characters Pig cannot handle.
    List<String> keywords = new ArrayList<String>();
    for (KeywordsList keywordsList : metadata.getKeywordsList()) {
        for (String keyword : keywordsList.getKeywordsList()) {
            keywords.add(removeAllPigUnfriendlySigns(keyword));
        }
    }
    output.set(fieldNumberMap.get(C.KEYWORDS), listToDataBag(keywords));

    // Contributors are stored as parallel lists of author keys and names.
    List<String> contributorKeys = new ArrayList<String>();
    List<String> contributorNames = new ArrayList<String>();
    for (Author author : metadata.getBasicMetadata().getAuthorList()) {
        contributorKeys.add(author.getKey());
        contributorNames.add(author.getName());
    }
    output.set(fieldNumberMap.get(C.CONTRIBUTORS), listToDataBag(contributorKeys, contributorNames));
    return output;
}
@SuppressWarnings("rawtypes") public DocumentDTO transformYElement(YExportable yExportable, ZipArchive currentZipArchive, String currentXmlPath, boolean metadataOnly, long contentSizeLimit) { DocumentDTO productObject = null; if (yExportable instanceof YElement) { YElement yElement = (YElement) yExportable; MetadataToProtoMetadataParser mtd2prt = new MetadataToProtoMetadataParserImpl(); DocumentMetadata docMetadata = mtd2prt.yelementToDocumentMetadata(yElement, currentZipArchive, currentXmlPath, collection); if (docMetadata != null) { productObject = new DocumentDTO(); productObject.setKey(docMetadata.getKey()); //Document and DocumentMetadata should have the same key? productObject.setDocumentMetadata(docMetadata); if (!metadataOnly) { List<YContentEntry> contents = yElement.getContents(); for (YContentEntry content : contents) { //get a media path from yElement handleContent(productObject, content, currentZipArchive, contentSizeLimit); } } } } return productObject; }
/**
 * Pig UDF: extracts only the document key from a serialized
 * {@code DocumentMetadata} protobuf stored in field 0 of the input tuple.
 *
 * @param input UDF input tuple; may be null/empty, in which case null is returned
 * @return single-field tuple containing the document key, or null
 * @throws IOException wrapping any parsing/processing failure
 */
@Override
public Tuple exec(Tuple input) throws IOException {
    if (input == null || input.size() == 0) {
        return null;
    }
    try {
        DataByteArray serialized = (DataByteArray) input.get(0);
        DocumentMetadata metadata = DocumentMetadata.parseFrom(serialized.get());
        Object[] fields = new Object[]{metadata.getKey()};
        return TupleFactory.getInstance().newTuple(Arrays.asList(fields));
    } catch (Exception e) {
        logger.error("Error in processing input row:", e);
        throw new IOException("Caught exception processing input row:\n" + StackTraceExtractor.getStackTrace(e));
    }
}
}
/**
 * Mapper: runs RAKE keyword extraction on a serialized DocumentWrapper and,
 * when any keywords were found, emits (document key, serialized KeywordsList
 * with provenance). Extraction option and language come from the job
 * configuration.
 *
 * @param key     input key (unused)
 * @param value   serialized DocumentProtos.DocumentWrapper
 * @param context mapper context records are written to
 */
@Override
protected void map(Writable key, BytesWritable value, Context context) throws IOException, InterruptedException {
    Configuration conf = context.getConfiguration();
    DocumentWrapper wrapper = DocumentProtos.DocumentWrapper.parseFrom(value.copyBytes());
    RakeExtractor extractor = new RakeExtractor(wrapper, conf.get(EXTRACTION_OPTION), conf.get(EXTRACTION_LANGUAGE));
    List<String> keywords = extractor.getKeywords();
    if (keywords.isEmpty()) {
        return; // nothing extracted -> emit nothing for this document
    }
    // Record provenance: when and by which algorithm the keywords were produced.
    DocumentProtos.ProvenanceInfo.SingleProvenanceInfo.Builder currentProvenance =
            DocumentProtos.ProvenanceInfo.SingleProvenanceInfo.newBuilder();
    currentProvenance.setLastModificationDate(new Date().getTime());
    currentProvenance.setLastModificationMarkerId(ALGORITHM_NAME);
    DocumentProtos.ProvenanceInfo.Builder provenance = DocumentProtos.ProvenanceInfo.newBuilder();
    provenance.setCurrentProvenance(currentProvenance);
    DocumentProtos.KeywordsList.Builder keywordsList = DocumentProtos.KeywordsList.newBuilder();
    keywordsList.setProvenance(provenance);
    keywordsList.addAllKeywords(keywords);
    String docId = wrapper.getDocumentMetadata().getKey();
    context.write(new Text(docId), new BytesWritable(keywordsList.build().toByteArray()));
}
}
/**
 * Pig UDF: validates a (rowId, serialized DocumentWrapper) tuple and returns
 * (rowId, document key, original serialized bytes).
 *
 * @param tuple input; field 0 is the row id, field 1 the serialized protobuf
 * @return three-field tuple (rowId, document key, protobuf bytes)
 * @throws IOException when the tuple does not have exactly two fields with a
 *                     bytearray second field, or on parse failure
 */
@Override
public Tuple exec(Tuple tuple) throws IOException {
    // Strict input validation: exactly two fields, second must be a bytearray.
    if (tuple == null || tuple.size() != 2 || tuple.getType(1) != DataType.BYTEARRAY) {
        throw new IOException("" + this.getClass().getName() + " expects 2 arguments, 2nd must be a bytearray");
    }
    String rowId = (String) tuple.get(0);
    DataByteArray wrapperBytes = (DataByteArray) tuple.get(1);
    DocumentWrapper wrapper = DocumentProtos.DocumentWrapper.parseFrom(wrapperBytes.get());
    Tuple result = tupleFactory.newTuple();
    result.append(rowId);
    result.append(wrapper.getDocumentMetadata().getKey());
    result.append(wrapperBytes);
    return result;
}
}
/**
 * Pig UDF: returns (document key, first title text) for a serialized
 * DocumentWrapper stored in field 1 of the input tuple.
 * NOTE(review): getTitle(0) assumes the document has at least one title and
 * will throw otherwise — confirm upstream guarantees this.
 *
 * @param tuple input; field 1 is the serialized DocumentProtos.DocumentWrapper
 * @return two-field tuple (document key, first title text)
 * @throws IOException on parse failure
 */
@Override
public Tuple exec(Tuple tuple) throws IOException {
    DataByteArray serialized = (DataByteArray) tuple.get(1);
    DocumentProtos.DocumentWrapper wrapper = DocumentProtos.DocumentWrapper.parseFrom(serialized.get());
    String id = wrapper.getDocumentMetadata().getKey();
    String title = wrapper.getDocumentMetadata().getBasicMetadata().getTitle(0).getText();
    return TupleFactory.getInstance().newTuple(Arrays.asList(id, title));
}
}