/**
 * Merges all serialized documents received for one key and writes the merged document out.
 *
 * <p>Every value is parsed as a {@code DocumentProtos.DocumentWrapper}. A value that cannot be
 * parsed is logged at SEVERE level (together with the key it belongs to) and left out of the
 * merge; the remaining documents are passed to {@code docDuplicatesMerger}. The merged document
 * is written through {@code mos} with the base output path {@code Type.DOCUMENT.name() + "/"}.
 *
 * @param key    key shared by the documents; also used as the key of the written record
 * @param values serialized {@code DocumentWrapper} messages to merge
 * @throws IOException          propagated from the output write
 * @throws InterruptedException propagated from the output write
 */
void mergeDocuments(String key, Iterable<BytesWritable> values) throws IOException, InterruptedException {
    List<DocumentProtos.DocumentWrapper> dwList = new ArrayList<>();
    for (BytesWritable bw : values) {
        try {
            // copyBytes() hands the parser its own array instead of the writable's backing buffer.
            dwList.add(DocumentProtos.DocumentWrapper.parseFrom(bw.copyBytes()));
        } catch (InvalidProtocolBufferException ex) {
            // Best effort: one corrupt record must not abort the whole group, but say which key lost it.
            Logger.getLogger(HBaseToProtosReducer.class.getName())
                    .log(Level.SEVERE, "Skipping unparsable DocumentWrapper for key " + key, ex);
        }
    }
    DocumentProtos.DocumentWrapper merged = docDuplicatesMerger.merge(dwList);
    mos.write(new Text(key), new BytesWritable(merged.toByteArray()), Type.DOCUMENT.name() + "/");
}
/**
 * Emits a reduced copy of the incoming document under a key generated from its metadata.
 *
 * <p>The value is parsed as a {@code DocumentWrapper} and {@code keyGen} produces a key from the
 * document metadata. When that key is empty nothing is written. Otherwise the copy returned by
 * {@code DocumentWrapperUtils.cloneDocumentMetadata} is serialized and written under the key.
 *
 * @param key     input key; not used by this mapper
 * @param value   serialized {@code DocumentWrapper}
 * @param context mapper context that receives the output record
 * @throws IOException          if the value cannot be parsed or the write fails
 * @throws InterruptedException propagated from the context write
 */
@Override
public void map(Writable key, BytesWritable value, Mapper<Writable, BytesWritable, Text, BytesWritable>.Context context) throws IOException, InterruptedException {
    DocumentWrapper parsed = DocumentProtos.DocumentWrapper.parseFrom(value.copyBytes());
    String generatedKey = keyGen.generateKey(parsed.getDocumentMetadata());

    // Documents for which no key could be generated are dropped.
    if (generatedKey.isEmpty()) {
        return;
    }

    DocumentWrapper thinCopy = DocumentWrapperUtils.cloneDocumentMetadata(parsed);
    byte[] serialized = thinCopy.toByteArray();
    context.write(new Text(generatedKey), new BytesWritable(serialized));
}

//******************** PRIVATE ********************
/**
 * Builds one merged document from the input tuple by applying a merge strategy per action.
 *
 * <p>The starting builder comes from {@code mainBlockParsing(tuple)}. Then, for every entry of
 * {@code actions} except the one at position {@code mainGroupIndex}, the strategy class named by
 * {@code MergeMapping.hm} is instantiated reflectively and executed with the tuple, the field
 * index {@code 2 * i + 1} and the current builder. A strategy that cannot be loaded or fails is
 * logged and skipped, so the remaining strategies still run.
 *
 * @param tuple input tuple, validated by {@code checkCorrectness}
 * @return a two-field tuple: {@code docId} and the serialized merged document
 * @throws IOException if an I/O error occurs outside the per-strategy handling; it is logged and
 *                     rethrown unchanged
 */
@Override
public Tuple exec(Tuple tuple) throws IOException {
    checkCorrectness(tuple);
    try {
        DocumentWrapper.Builder dwb = mainBlockParsing(tuple);
        int i = -1;
        for (String s : actions) {
            i++;
            if (i == mainGroupIndex) {
                // The main group is not merged again by a strategy in this loop.
                continue;
            }
            try {
                // getDeclaredConstructor().newInstance() replaces the deprecated Class.newInstance(),
                // which lets checked constructor exceptions escape undeclared.
                IMerge merger = (IMerge) Class
                        .forName("pl.edu.icm.coansys.output.merge.all.strategies." + MergeMapping.hm.get(s))
                        .getDeclaredConstructor()
                        .newInstance();
                dwb = merger.execute(tuple, 2 * i + 1, dwb);
            } catch (Exception e) {
                // Deliberate best effort: a broken strategy must not discard the whole document.
                LOGGER.error(ERROR_STRING, e);
            }
        }
        Tuple result = tupleFactory.newTuple();
        // docId is defined outside this method (presumably set during parsing - confirm).
        result.append(docId);
        result.append(new DataByteArray(dwb.build().toByteArray()));
        return result;
    } catch (IOException e) {
        LOGGER.error(StackTraceExtractor.getStackTrace(e), e);
        throw e;
    }
}
byte[] bw=null; if (dwo instanceof DocumentProtos.DocumentWrapper) { bw=((DocumentProtos.DocumentWrapper) dwo).toByteArray(); } else { bw=((DocumentProtos.DocumentWrapper.Builder) dwo).build().toByteArray();
context.write(outKey, new BytesWritable(builder.build().toByteArray())); return; } else {
t.append(new DataByteArray(commonDocumentWrapper.build().toByteArray()));
private static void generateSequenceFile(String inputDir, String collection, String outputSequenceFile, boolean isSnappyCompressed, boolean metadataOnly, long contentSizeLimit) throws IOException { ZipDirToDocumentDTOIterator zdtp = new ZipDirToDocumentDTOIterator(inputDir, collection, metadataOnly, contentSizeLimit); SequenceFile.Writer writer = null; try { BytesWritable rowKeyBytesWritable = new BytesWritable(); BytesWritable documentWrapperBytesWritable = new BytesWritable(); writer = createSequenceFileWriter(outputSequenceFile, rowKeyBytesWritable, documentWrapperBytesWritable, isSnappyCompressed); for (DocumentDTO doc : zdtp) { DocumentWrapper docWrap = buildFrom(doc); // specify key and value byte[] rowKey = docWrap.getRowId().getBytes(); rowKeyBytesWritable.set(rowKey, 0, rowKey.length); byte[] dwBytes = docWrap.toByteArray(); //workaround for bug in BytesWritable class if (documentWrapperBytesWritable.getCapacity() < dwBytes.length) { int newCapacity = Math.max(dwBytes.length, dwBytes.length / 4 * 5); documentWrapperBytesWritable.setCapacity(newCapacity); } documentWrapperBytesWritable.set(dwBytes, 0, dwBytes.length); // append to the sequence file writer.append(rowKeyBytesWritable, documentWrapperBytesWritable); if (documentCount % 10000 == 0) { printStats(); } } } finally { IOUtils.closeStream(writer); } }
t.append(new DataByteArray(documentWrapper.build().toByteArray())); return t;
/**
 * Replaces the DOI stored in a serialized document with a corrected value.
 *
 * <p>The input must hold exactly three fields: a key, the serialized {@code DocumentWrapper}
 * and the corrected DOI. The DOI is set on a copy of the document's basic metadata and the
 * rebuilt document is returned together with the unchanged key.
 *
 * @param input three-field tuple as described above
 * @return a two-field tuple (key, serialized corrected document), or {@code null} when
 *         {@code input} is {@code null} or does not have exactly three fields
 * @throws IOException if anything fails while processing the row; the original exception is
 *                     attached as the cause
 */
public Tuple exec(Tuple input) throws IOException {
    if (input == null || input.size() != 3) {
        return null;
    }
    try {
        String key = (String) input.get(0);
        DocumentWrapper dw = DocumentWrapper.parseFrom(((DataByteArray) input.get(1)).get());
        String correctedDoi = (String) input.get(2);

        // Rebuild the nested messages from the inside out: basic metadata -> metadata -> wrapper.
        DocumentWrapper.Builder dwb = DocumentWrapper.newBuilder(dw);
        DocumentMetadata.Builder dmb = DocumentMetadata.newBuilder(dw.getDocumentMetadata());
        BasicMetadata.Builder bmb = BasicMetadata.newBuilder(dmb.getBasicMetadata());
        bmb.setDoi(correctedDoi);
        dmb.setBasicMetadata(bmb);
        dwb.setDocumentMetadata(dmb);

        Tuple ret = TupleFactory.getInstance().newTuple();
        ret.append(key);
        ret.append(new DataByteArray(dwb.build().toByteArray()));
        return ret;
    } catch (Exception e) {
        logger.error("Error in processing input row:" + StackTraceExtractor.getStackTrace(e), e);
        // Keep the original exception as the cause so callers can inspect it programmatically.
        throw new IOException("Caught exception processing input row:\n" + StackTraceExtractor.getStackTrace(e), e);
    }
}
}
/**
 * Converts one HBase row into protocol-buffer records and writes them to named outputs.
 *
 * <p>The row is loaded into {@code converter}, from which the row id, the document metadata
 * bytes and the document media bytes are read. A {@code DocumentWrapper} built from those
 * three is serialized and written to the {@code "dproto"} output; the raw metadata bytes are
 * written to the {@code FAMILY_METADATA_DOCUMENT_QUALIFIER_PROTO} output. Each output has a
 * written counter and a skipped counter, incremented depending on whether the bytes are
 * {@code null}.
 *
 * @param row     HBase row key; not used directly, the row id is taken from {@code converter}
 * @param values  HBase row content
 * @param context mapper context, used here for the counters
 * @throws IOException          propagated from the output writes
 * @throws InterruptedException propagated from the output writes
 */
@Override
public void map(ImmutableBytesWritable row, Result values, Context context) throws IOException, InterruptedException {
    converter.set(values, dw);

    final byte[] rowId = converter.getRowId();
    final byte[] metadataBytes = converter.getDocumentMetadata();
    final byte[] mediaBytes = converter.getDocumentMedia();
    final DocumentWrapper documentWrapper = converter.toDocumentWrapper(rowId, metadataBytes, mediaBytes);
    final byte[] wrapperBytes = documentWrapper.toByteArray();

    // The same key is shared by both records written below.
    key.set(rowId, 0, rowId.length);

    if (wrapperBytes == null) {
        context.getCounter(Counters.DPROTO_SKIPPED).increment(1);
    } else {
        documentProto.set(wrapperBytes, 0, wrapperBytes.length);
        mos.write("dproto", key, documentProto);
        context.getCounter(Counters.DPROTO).increment(1);
    }

    if (metadataBytes == null) {
        context.getCounter(Counters.MPROTO_SKIPPED).increment(1);
    } else {
        metatdataProto.set(metadataBytes, 0, metadataBytes.length);
        mos.write(FAMILY_METADATA_DOCUMENT_QUALIFIER_PROTO, key, metatdataProto);
        context.getCounter(Counters.MPROTO).increment(1);
    }
}
/**
 * Merges a bag of duplicate documents into a single document.
 *
 * <p>Field 1 of the input is read as a bag; field 1 of every tuple in that bag is parsed as a
 * serialized {@code DocumentWrapper}. All parsed documents are handed to {@code merger}.
 *
 * @param input tuple whose field 1 is the bag of duplicates
 * @return a two-field tuple: the merged document's row id and its serialized form
 * @throws IOException if a field cannot be read or a document cannot be parsed
 */
@Override
public Tuple exec(Tuple input) throws IOException {
    DataBag duplicatesBag = (DataBag) input.get(1);

    List<DocumentWrapper> parsedDocs = new ArrayList<DocumentWrapper>();
    for (Tuple entry : duplicatesBag) {
        DataByteArray serialized = (DataByteArray) entry.get(1);
        parsedDocs.add(DocumentWrapper.parseFrom(serialized.get()));
    }

    DocumentWrapper mergedDoc = merger.merge(parsedDocs);

    Tuple output = TupleFactory.getInstance().newTuple();
    output.append(mergedDoc.getRowId());
    output.append(new DataByteArray(mergedDoc.toByteArray()));
    return output;
}
}
/**
 * Applies the configured merge strategy to one document.
 *
 * <p>Field 0 of the tuple is the row id and field 1 the serialized {@code DocumentWrapper}.
 * The document is parsed into a builder, {@code merger} is executed on it with field index 3,
 * and the resulting builder is built and serialized.
 *
 * @param tuple input tuple, validated by {@code checkCorrectness}
 * @return a two-field tuple: the unchanged row id and the serialized resulting document
 * @throws IOException if a field cannot be read or the document cannot be parsed
 */
@Override
public Tuple exec(Tuple tuple) throws IOException {
    checkCorrectness(tuple);

    String rowId = (String) tuple.get(0);
    DataByteArray originalBytes = (DataByteArray) tuple.get(1);

    DocumentWrapper parsedDoc = DocumentWrapper.parseFrom(originalBytes.get());
    DocumentWrapper.Builder mergedBuilder = merger.execute(tuple, 3, DocumentWrapper.newBuilder(parsedDoc));
    byte[] mergedBytes = mergedBuilder.build().toByteArray();

    Tuple output = TupleFactory.getInstance().newTuple();
    output.append(rowId);
    output.append(new DataByteArray(mergedBytes));
    return output;
}