// Parse the media's content, exposed as an input stream, with `db` and keep the result in `doc`.
Document doc = db.parse(media.getContent().newInput());
MediaContainer mc = MediaContainer.parseFrom(scannerResult.value()); for(Media media : mc.getMediaList()){ names.add(media.getMediaType());
/**
 * Builds a one-line textual label for a media entry.
 *
 * @param id   identifier placed after the {@code "Id: "} prefix
 * @param next media whose key and media type are appended to the label
 * @return text of the form {@code Id: <id> Key: <key> media: <mediaType>}
 */
private static String getDescription(String id, Media next) {
    return "Id: " + id
            + " Key: " + next.getKey()
            + " media: " + next.getMediaType();
}
List<DocumentProtos.Media> meList=mc.getMediaList(); for (DocumentProtos.Media m:meList) { if (m.getKey().equalsIgnoreCase(CrossrefConstants.KEY_CROSSREF_JSON_RECORD) || m.getKey().equalsIgnoreCase(CrossrefConstants.KEY_CROSSREF_UNIXREF_RECORD)){ if (m.getKey().equalsIgnoreCase(CCBW2Constants.KEY_SCHOLAR_RECORD)){ gsMedia=m; byte[] m=downloader.downloadUnixrefAndCheckCorrecteness(doi); if (m!=null) { DocumentProtos.Media.Builder mediaBuild=DocumentProtos.Media.newBuilder(); mediaBuild.setKey(CrossrefConstants.KEY_CROSSREF_UNIXREF_RECORD); mediaBuild.setMediaType(CrossrefConstants.TYPE_CROSSREF_UNIXREF_RECORD); byte[] m=downloader.downloadCrossrefJson(doi); if (m!=null) { DocumentProtos.Media.Builder mediaBuild=DocumentProtos.Media.newBuilder(); mediaBuild.setKey(CrossrefConstants.KEY_CROSSREF_JSON_RECORD); mediaBuild.setMediaType(CrossrefConstants.TYPE_CROSSREF_JSON_RECORD);
// Assemble a new Media message: keyed by pdfInZip, with content copied from pdfByteArray.
Media.Builder mediaBuilder = Media.newBuilder(); mediaBuilder.setKey(pdfInZip); mediaBuilder.setContent(ByteString.copyFrom(pdfByteArray));
LOGGER.debug("There were no suitable transformation. Available media types are: "); for (DocumentProtos.Media tmp : mc.getMediaList()) { LOGGER.debug("key: {}, type: {}", tmp.getKey(), tmp.getMediaType());
MediaContainer mediaContainer = docWrapper.getMediaContainer(); for (Media media : mediaContainer.getMediaList()) { logger.info("Processing file " + media.getSourcePath()); if (ProtoConstants.mediaTypePdf.equals(media.getMediaType())) { long fileSize; if (media.hasSourceFilesize()) { fileSize = media.getSourceFilesize(); } else { logger.warn("Source file size is not set in " + media.getKey() + ", using getSerializedSize() method"); fileSize = media.getSerializedSize(); InputStream pdfIS = media.getContent().newInput(); try { Media.Builder nlmMediaBuilder = Media.newBuilder(); nlmMediaBuilder.setCollection(media.getCollection()); nlmMediaBuilder.setKey(media.getKey()); nlmMediaBuilder.setSourceFilesize(nlmString.length()); nlmMediaBuilder.setContent(ByteString.copyFromUtf8(nlmString)); context.write(new Text(media.getKey()), new BytesWritable(nlmMediaBuilder.build().toByteArray())); } catch (AnalysisException ex) { logger.warn("cannot process PDF " + media.getSourcePath(), ex); } catch (ExceptionConverter ex) { logger.warn("cannot process PDF (unknown colorspace?) " + media.getSourcePath(), ex); logger.info("Finished " + media.getSourcePath()); } else { logger.info("File " + media.getSourcePath() + " is not a PDF");
// Start a new Media message and give it the key taken from docDTO.
Media.Builder mediaBuilder = Media.newBuilder(); mediaBuilder.setKey(docDTO.getKey());
NlmToYTransformer jatsReader=new NlmToYTransformer(); String art=new String(media.getContent().toByteArray(),"UTF-8"); List<YExportable> exps=null; if (art.contains(" PUBLIC \"-//NLM//DTD JATS ")) {
if (media.getMediaType().equals(ProtoConstants.mediaTypePdf)) { try { sb.append(extractTextFromPdf(media.getContent().toByteArray(), this.lang)); } catch (Exception ex) { logger.error("Cannot extract text from PDF: " + ex.toString() + " " + media.getSourcePath()); } else if (media.getMediaType().equals(ProtoConstants.mediaTypeTxt)) { sb.append(filterTextByLang(media.getContent().toStringUtf8(), lang.getLangCode()));
LOGGER.info("\tMediaConteiner size: " + (mediaSize / 1024 / 1024) + "MB"); for (Media media : mediaConteiner.getMediaList()) { long size = media.getSourceFilesize() / 1024 / 1024; LOGGER.info("\tSourcePath = " + media.getSourcePath()); LOGGER.info("\tSourcePathFilesize = " + size + "MB"); mediaCount++;
OafId rootId=new OafId(docNewId); rootId.isComacId=true; List<MultiTypeParseResult> readResults=reader.read(new String(media.getContent().toByteArray(),"UTF-8"),rootId,idTranslator); readResults.stream().forEach((MultiTypeParseResult t) -> { t.getDocuments().forEach((DocumentProtos.DocumentWrapperOrBuilder b) -> {
/**
 * Decodes the content of {@code media} as UTF-8, parses it with {@code reader}, and copies
 * every document, person, project and organization from the parse results into
 * {@code result}.
 *
 * @param media       media whose content bytes are decoded and parsed
 * @param docNewId    identifier used to build the root {@code PbnId}
 * @param result      accumulator that receives all parsed entities
 * @param mediaToCopy not referenced by this implementation
 * @return {@code true} when the reader returned at least one parse result; {@code false}
 *         when it returned none or an {@code UnsupportedEncodingException} was caught
 *         (the exception is logged)
 */
@Override
public boolean transform(DocumentProtos.Media media, String docNewId, MultiTypeParseResult result,
        DocumentProtos.MediaContainerOrBuilder mediaToCopy) {
    try {
        log.info("Opeanir records parsing: " + docNewId);

        PbnId rootId = new PbnId(docNewId);
        rootId.isComacId = true;

        String content = new String(media.getContent().toByteArray(), "UTF-8");
        List<MultiTypeParseResult> parsed = reader.read(content, rootId);

        // Flatten every partial result into the caller-supplied accumulator.
        for (MultiTypeParseResult part : parsed) {
            part.getDocuments().forEach(
                    (DocumentProtos.DocumentWrapperOrBuilder doc) -> result.add(doc));
            part.getPersons().forEach(
                    (PersonProtos.PersonWrapperOrBuilder person) -> result.add(person));
            part.getProjects().forEach(
                    (ProjectProtos.ProjectWrapperOrBuilder project) -> result.add(project));
            part.getOrganizations().forEach(
                    (OrganizationProtos.OrganizationWrapperOrBuilder organization) -> result.add(organization));
        }
        return !parsed.isEmpty();
    } catch (UnsupportedEncodingException ex) {
        log.error(ex.getMessage(), ex);
        return false;
    }
}
}
/**
 * Looks through the columns of {@code row} for one whose qualifier equals the bytes of
 * {@code BWMETA_QUALIFIER}, parses that column's value as a {@code MediaContainer}, and
 * builds a {@code BWMetaFile} from the first media whose key equals
 * {@code BWMeta2Constants.KEY_BWMETA2_RECORD}.
 *
 * @param row row whose columns are inspected; its id becomes the id of the returned file
 * @return the file built from the first matching media, or {@code null} when no column
 *         holds such a media
 * @throws RuntimeException when a matching column's value cannot be parsed as a
 *         {@code MediaContainer}
 */
@Override
public BWMetaFile toProtoBuf(Row row) {
    for (Column column : row.getColumns()) {
        if (Arrays.areEqual(BWMETA_QUALIFIER.getBytes(), column.getQualifier())) {
            try {
                DocumentProtos.MediaContainer container =
                        DocumentProtos.MediaContainer.parseFrom(column.getValue());
                for (Media candidate : container.getMediaList()) {
                    boolean isBwmetaRecord =
                            BWMeta2Constants.KEY_BWMETA2_RECORD.equals(candidate.getKey());
                    if (isBwmetaRecord) {
                        return fromMedia(new String(row.getId()), candidate);
                    }
                }
            } catch (InvalidProtocolBufferException e) {
                throw new RuntimeException("Invalid data:", e);
            }
        }
    }
    // No column carried a BWMETA2 record.
    return null;
}
/**
 * Reads the UTF-8 content of {@code media} with {@code reader}; when exactly one document
 * comes back, merges its metadata into {@code dmBuider}, clears the collection field, then
 * reads the same content with {@code yreader} and passes the result to
 * {@code addBwmetaMedia}.
 *
 * @param media    media whose content is parsed
 * @param docNewId identifier forwarded to {@code addBwmetaMedia}
 * @param dmBuider metadata builder that receives the parsed document metadata
 * @param builder  document wrapper builder forwarded to {@code addBwmetaMedia}
 * @return {@code true} when exactly one document was read and processed; {@code false}
 *         when the reader returned a different number of documents or any exception was
 *         thrown (both cases are logged)
 */
@Override
public boolean transform(DocumentProtos.Media media, String docNewId,
        DocumentProtos.DocumentMetadata.Builder dmBuider, DocumentProtos.DocumentWrapper.Builder builder) {
    try {
        String mediaCF = media.getContent().toStringUtf8();
        List<DocumentProtos.DocumentWrapperOrBuilder> docs = reader.read(mediaCF);

        if (docs.size() != 1) {
            LOGGER.error("There was exactly one record in input string; number of output items: " + docs.size());
            return false;
        }

        DocumentProtos.DocumentMetadata parsedMetadata = docs.get(0).getDocumentMetadata();
        dmBuider.mergeFrom(parsedMetadata);
        dmBuider.clearCollection();

        List<YExportable> exportables = yreader.read(mediaCF);
        addBwmetaMedia(exportables, docNewId, builder);
        return true;
    } catch (Exception ex) {
        LOGGER.error("Error: ", ex);
        return false;
    }
}
/**
 * Parses the content of {@code media} as a {@code ScholarRecordP} and translates it into
 * document metadata via {@code translateGoogleScholarToDocumentMetadata}.
 *
 * @param media    media whose content bytes hold a serialized {@code ScholarRecordP}
 * @param docNewId identifier stored in the {@code AuthorData} passed to the translation
 * @param dmBuider metadata builder filled by the translation; on success it also receives
 *                 the basic metadata built during translation
 * @param builder  not referenced by this implementation
 * @return {@code true} when the record was parsed and the translation reported success;
 *         {@code false} when parsing failed (logged at SEVERE) or the translation
 *         reported failure
 */
@Override
public boolean transform(Media media, String docNewId, DocumentMetadata.Builder dmBuider,
        DocumentProtos.DocumentWrapper.Builder builder) {
    ScholarRecordP record;
    try {
        // ByteString.toByteArray() already returns a freshly allocated array, so it is
        // parsed directly; a second defensive copy would only double the memory used.
        record = ScholarRecordP.parseFrom(media.getContent().toByteArray());
    } catch (InvalidProtocolBufferException ex) {
        // Name the affected document so the failure can be traced from the log alone.
        java.util.logging.Logger.getLogger(GsMediaToBw2Metadata.class.getName())
                .log(Level.SEVERE, "Cannot parse ScholarRecordP for document " + docNewId, ex);
        return false;
    }

    BasicMetadata.Builder bmBuilder = BasicMetadata.newBuilder();
    AuthorData ad = new AuthorData();
    ad.docId = docNewId;

    if (!translateGoogleScholarToDocumentMetadata(record, dmBuider, bmBuilder, ad)) {
        return false;
    }
    dmBuider.setBasicMetadata(bmBuilder);
    return true;
}
}
/**
 * Runs every registered transformer against every media in {@code media} whose key matches
 * the transformer's supported key (case-insensitively).
 *
 * <p>An exception thrown by a transformer is logged together with the row id and the
 * transformer's supported key; processing then continues with the next media.
 *
 * @param rowId       row identifier, used only in the error log message
 * @param result      accumulator handed to each transformer
 * @param media       container whose media list is scanned; also handed to each transformer
 * @param newId       identifier handed to each transformer
 * @param transformed initial value of the success flag
 * @return {@code true} when {@code transformed} was already {@code true} or at least one
 *         transformer call returned {@code true}
 */
@SuppressWarnings("unchecked")
boolean transformAndReturnSuccess(String rowId, MultiTypeParseResult result,
        DocumentProtos.MediaContainerOrBuilder media, String newId, boolean transformed) {
    boolean anySucceeded = transformed;
    for (ProtoMediaMetadataToMetadata transformer : transformers) {
        for (Media candidate : media.getMediaList()) {
            if (!candidate.hasKey()
                    || !transformer.getSupportedKey().equalsIgnoreCase(candidate.getKey())) {
                continue;
            }
            try {
                if (transformer.transform(candidate, newId, result, media)) {
                    anySucceeded = true;
                }
            } catch (Exception e) {
                LOGGER.error("exception at row id: "+rowId+" transfomer: "+transformer.getSupportedKey(), e);
            }
        }
    }
    return anySucceeded;
}
/**
 * Reads the content of {@code media} with the BWMETA 2.1 to Y reader obtained from
 * {@code MetadataTransformers.BTF}, converts each exported element to document metadata
 * with {@code parser}, and merges all of it into {@code dmBuider}.
 *
 * <p>The content bytes are decoded explicitly as UTF-8. Relying on the platform default
 * charset would make the result depend on the machine the code runs on and can corrupt
 * non-ASCII metadata.
 *
 * @param media    media whose content holds the metadata record
 * @param docNewId not referenced by this implementation
 * @param dmBuider metadata builder that receives the merged metadata; its collection field
 *                 is cleared afterwards
 * @param builder  not referenced by this implementation
 * @return always {@code true}
 * @throws ClassCastException when the reader returns an element that is not a
 *         {@code YElement}
 */
@Override
public boolean transform(DocumentProtos.Media media, String docNewId,
        DocumentProtos.DocumentMetadata.Builder dmBuider, DocumentProtos.DocumentWrapper.Builder builder) {
    List<YExportable> yExportableList = MetadataTransformers.BTF
            .getReader(BwmetaTransformerConstants.BWMETA_2_1, BwmetaTransformerConstants.Y)
            .read(new InputStreamReader(media.getContent().newInput(),
                    java.nio.charset.StandardCharsets.UTF_8));

    for (YExportable yExportable : yExportableList) {
        DocumentProtos.DocumentMetadata dm =
                parser.yelementToDocumentMetadata((YElement) yExportable, null, null, "synat");
        dmBuider.mergeFrom(dm);
    }
    dmBuider.clearCollection();
    return true;
}
/**
 * Reads the UTF-8 content of {@code media} with {@code reader}; when exactly one document
 * comes back, merges its metadata into {@code dmBuider} and clears the collection field.
 *
 * @param media    media whose content is parsed
 * @param docNewId not referenced by this implementation
 * @param dmBuider metadata builder that receives the parsed document metadata
 * @param builder  not referenced by this implementation
 * @return {@code true} when exactly one document was read and merged; {@code false} when
 *         the reader returned a different number of documents or any exception was thrown
 *         (both cases are logged)
 */
@Override
public boolean transform(DocumentProtos.Media media, String docNewId,
        DocumentProtos.DocumentMetadata.Builder dmBuider, DocumentProtos.DocumentWrapper.Builder builder) {
    try {
        String content = media.getContent().toStringUtf8();
        List<DocumentProtos.DocumentWrapperOrBuilder> docs = reader.read(content);

        if (docs.size() != 1) {
            LOGGER.error("There was exactly one record in input string; number of output items: " + docs.size());
            return false;
        }

        DocumentProtos.DocumentMetadata parsedMetadata = docs.get(0).getDocumentMetadata();
        dmBuider.mergeFrom(parsedMetadata);
        dmBuider.clearCollection();
        return true;
    } catch (Exception ex) {
        LOGGER.error("Error: ", ex);
        return false;
    }
}
}
/**
 * Wraps the content of a media entry in an in-memory resource and returns it as a
 * {@code BWMetaFile}.
 *
 * @param id    identifier of the resulting file; also used in the resource description
 * @param media media whose content bytes back the resource
 * @return a {@code BWMetaFile} constructed from {@code id}, an empty string, the in-memory
 *         resource and {@code null}
 */
private static BWMetaFile fromMedia(String id, Media media) {
    byte[] content = media.getContent().toByteArray();
    String description = getDescription(id, media);
    Resource resource = new InMemoryResource(content, description);
    return new BWMetaFile(id, StringUtils.EMPTY, resource, null);
}