addMetadata(metadata, property, pdfBoxBaseline); List<String> items = getXMPBagOrSeqList(dc, property.getName()); if (items == null) { if (pdfBoxBaseline != null && pdfBoxBaseline.length() > 0) { addMetadata(metadata, property, pdfBoxBaseline); addMetadata(metadata, property, item); addMetadata(metadata, property, pdfBoxBaseline);
/**
 * Adds a single string value to the metadata under the given name.
 * The value is passed through {@code decode} before it is stored.
 *
 * @param metadata the metadata object to add to
 * @param name     the metadata key
 * @param value    the raw value; {@code null} is ignored
 */
private void addMetadata(Metadata metadata, String name, String value) {
    if (value == null) {
        // Nothing to record for an absent value.
        return;
    }
    String decodedValue = decode(value);
    metadata.add(name, decodedValue);
}
Document dom = loadDOM(document.getDocumentCatalog().getMetadata(), metadata, context); extractMultilingualItems(metadata, TikaCoreProperties.TITLE, info.getTitle(), dcSchema); addMetadata(metadata, PDF.DOC_INFO_TITLE, info.getTitle()); extractDublinCoreListItems(metadata, TikaCoreProperties.CREATOR, info.getAuthor(), dcSchema); addMetadata(metadata, PDF.DOC_INFO_CREATOR, info.getAuthor()); extractDublinCoreListItems(metadata, TikaCoreProperties.CONTRIBUTOR, null, dcSchema); addMetadata(metadata, TikaCoreProperties.CREATOR_TOOL, info.getCreator()); addMetadata(metadata, PDF.DOC_INFO_CREATOR_TOOL, info.getCreator()); addMetadata(metadata, TikaCoreProperties.KEYWORDS, info.getKeywords()); addMetadata(metadata, PDF.DOC_INFO_KEY_WORDS, info.getKeywords()); addMetadata(metadata, "producer", info.getProducer()); addMetadata(metadata, PDF.DOC_INFO_PRODUCER, info.getProducer()); extractMultilingualItems(metadata, TikaCoreProperties.DESCRIPTION, null, dcSchema); addMetadata(metadata, PDF.DOC_INFO_SUBJECT, info.getSubject()); addMetadata(metadata, TikaCoreProperties.TRANSITION_SUBJECT_TO_OO_SUBJECT, info.getSubject()); addMetadata(metadata, "trapped", info.getTrapped()); addMetadata(metadata, PDF.DOC_INFO_TRAPPED, info.getTrapped()); addMetadata(metadata, "created", info.getCreationDate()); addMetadata(metadata, PDF.DOC_INFO_CREATED, info.getCreationDate()); addMetadata(metadata, TikaCoreProperties.CREATED, info.getCreationDate()); Calendar modified = info.getModificationDate(); addMetadata(metadata, Metadata.LAST_MODIFIED, modified); addMetadata(metadata, TikaCoreProperties.MODIFIED, modified); addMetadata(metadata, PDF.DOC_INFO_MODIFICATION_DATE, info.getModificationDate());
password = getPassword(metadata, context); if (tstream != null && tstream.hasFile()) { extractMetadata(pdfDocument, metadata, context); AccessChecker checker = localConfig.getAccessChecker(); checker.check(metadata); if (handler != null) { if (shouldHandleXFAOnly(pdfDocument, localConfig)) { handleXFAOnly(pdfDocument, handler, metadata, context); } else if (localConfig.getOcrStrategy().equals(PDFPureJavaParserConfig.OCR_STRATEGY.OCR_ONLY)) { metadata.add("X-Parsed-By", "org.apache.tika.parser.ocr.TesseractOCRParser");
/** * Used when processing custom metadata entries, as PDFBox won't do * the conversion for us in the way it does for the standard ones */ private void addMetadata(Metadata metadata, String name, COSBase value) { if (value instanceof COSArray) { for (Object v : ((COSArray) value).toList()) { addMetadata(metadata, name, ((COSBase) v)); } } else if (value instanceof COSString) { addMetadata(metadata, name, ((COSString) value).getString()); } // Avoid calling COSDictionary#toString, since it can lead to infinite // recursion. See TIKA-1038 and PDFBOX-1835. else if (value != null && !(value instanceof COSDictionary)) { addMetadata(metadata, name, value.toString()); } }
addMetadata(metadata, property, pdfBoxBaseline); continue; addMetadata(metadata, property, value); if (!property.isMultiValuePermitted()) { return; addMetadata(metadata, property, pdfBoxBaseline);
private void addMetadata(Metadata metadata, Property property, String value) { if (value != null) { String decoded = decode(value); if (property.isMultiValuePermitted() || metadata.get(property) == null) { metadata.add(property, decoded); } //silently skip adding property that already exists if multiple values are not permitted } }