/**
 * Picks the character encoding to use for a page.
 *
 * @param page     page whose declared content charset is preferred
 * @param metadata consulted under the "Content-Encoding" key when the page
 *                 declares no usable charset
 * @return the page's content charset when it is neither null nor empty;
 *         otherwise whatever {@code metadata.get("Content-Encoding")} returns
 */
private String chooseEncoding(Page page, Metadata metadata) {
    final String declared = page.getContentCharset();
    final boolean hasDeclared = declared != null && !declared.isEmpty();
    return hasDeclared ? declared : metadata.get("Content-Encoding");
}
}
/**
 * Detects the media type of the supplied content.
 *
 * <p>The stream is wrapped in a {@link BufferedInputStream} that is closed when
 * detection finishes, which also closes the stream passed in by the caller.
 *
 * @param is   content to inspect; closed by this method
 * @param name resource name handed to the detector under
 *             {@code Metadata.RESOURCE_NAME_KEY}
 * @return the string form of the detected media type, or {@code WILDCARD} when
 *         an {@link IOException} occurs
 */
public static String probeContentType(final InputStream is, final String name) {
    try (InputStream stream = new BufferedInputStream(is)) {
        final Metadata metadata = new Metadata();
        metadata.set(Metadata.RESOURCE_NAME_KEY, name);
        return getDefaultMimeTypes().detect(stream, metadata).toString();
    } catch (IOException e) {
        // The exception is the trailing argument with no placeholder of its own,
        // so the logger treats it as the throwable (stack trace included) rather
        // than leaving a dangling "{}" or flattening it to its toString().
        LOGGER.warn("Couldn't detect the media type of attachment {}", name, e);
        return WILDCARD;
    }
}
public void setBinaryContent(byte[] data) throws TransformerConfigurationException, TikaException, SAXException, IOException { InputStream inputStream = new ByteArrayInputStream(data); ByteArrayOutputStream outputStream = new ByteArrayOutputStream(); try { TransformerHandler handler = getTransformerHandler(outputStream, DEFAULT_OUTPUT_FORMAT, DEFAULT_ENCODING); AUTO_DETECT_PARSER.parse(inputStream, handler, new Metadata(), context); // Hacking the following line to remove Tika's inserted DocType this.html = new String(outputStream.toByteArray(), DEFAULT_ENCODING).replace( "http://www.w3.org/1999/xhtml", ""); } catch (TransformerConfigurationException | TikaException | SAXException | IOException | RuntimeException e) { throw e; } }
private Map<String, String> tika_parse(InputStream sourceStream, String prefix, Integer maxAttribs, Integer maxAttribLen) throws IOException, TikaException, SAXException { final Metadata metadata = new Metadata(); final TikaInputStream tikaInputStream = TikaInputStream.get(sourceStream); try { final Pattern metadataKeyFilter = metadataKeyFilterRef.get(); final StringBuilder dataBuilder = new StringBuilder(); for (final String key : metadata.names()) { if (metadataKeyFilter != null && !metadataKeyFilter.matcher(key).matches()) { continue; if (metadata.isMultiValued(key)) { for (String val : metadata.getValues(key)) { if (dataBuilder.length() > 1) { dataBuilder.append(", "); dataBuilder.append(metadata.get(key));
@Override public void process(final InputStream stream) throws IOException { try (final InputStream in = new BufferedInputStream(stream)) { TikaInputStream tikaStream = TikaInputStream.get(in); Metadata metadata = new Metadata(); if (filename != null && context.getProperty(USE_FILENAME_IN_DETECTION).asBoolean()) { metadata.add(TikaMetadataKeys.RESOURCE_NAME_KEY, filename); } // Get mime type MediaType mediatype = detector.detect(tikaStream, metadata); mimeTypeRef.set(mediatype.toString()); } } });
/**
 * Stores a date value for the given property.
 *
 * <p>Pure pass-through: both arguments are handed unchanged to the superclass
 * implementation.
 *
 * @param property property to set
 * @param date     date value forwarded to the superclass
 * @see org.apache.tika.xmp.XMPMetadata#set(java.lang.String, java.lang.String)
 */
@Override
public void set(Property property, Date date) {
    super.set(property, date);
}
private void setupMetadata(Metadata metadata) { // simple property metadata.set( TikaCoreProperties.FORMAT, GENERIC_MIMETYPE ); // language alternative metadata.set( TikaCoreProperties.TITLE, "title" ); // array metadata.set( TikaCoreProperties.SUBJECT, new String[] { "keyword1", "keyword2" } ); // date metadata.set( TikaCoreProperties.MODIFIED, "2001-01-01T01:01" ); // int simple property metadata.set( Property.internalInteger( "xmp:Integer" ), "2" ); }
/**
 * Adds a value for the property to the {@code metadata} field.
 * A null value is ignored and leaves the metadata untouched.
 *
 * @param property property to add the value under
 * @param value    value to add; may be null
 */
private void add(Property property, String value) {
    if (value == null) {
        return;
    }
    metadata.add(property, value);
}
/**
 * Returns the values stored under the name of the given property.
 *
 * @param property property whose name is used for the lookup
 * @return the result of {@code _getValues} for that name
 */
public String[] getValues(final Property property) {
    final String name = property.getName();
    return _getValues(name);
}
// Metadata instance built with the no-argument constructor; the reference is final and never reassigned.
final Metadata metadata = new Metadata();
private void addMetadataByProperty( Metadata metadata, Property property, String value ) { // Add metadata if an appropriate value is passed if (value != null) { metadata.set( property, value ); } }
/**
 * Reads the value stored under
 * {@code AbstractRecursiveParserWrapperHandler.PARSE_TIME_MILLIS}.
 *
 * @param m metadata to read from
 * @return the stored value, or the string "-1" when none is present
 */
String getTime(Metadata m) {
    final String recorded = m.get(AbstractRecursiveParserWrapperHandler.PARSE_TIME_MILLIS);
    return recorded != null ? recorded : "-1";
}
// Parse "myfile.html" with HtmlParser and collect what the BodyContentHandler gathers as a string.
ContentHandler handler = new BodyContentHandler();
Metadata metadata = new Metadata();
// try-with-resources closes the file handle even when parsing throws;
// the original left the FileInputStream open.
try (InputStream input = new FileInputStream("myfile.html")) {
    new HtmlParser().parse(input, handler, metadata, new ParseContext());
}
String plainText = handler.toString();
/**
 * Sets an integer property on the given metadata when the value is positive.
 *
 * @param metadata metadata object to update
 * @param property property to set
 * @param value    value to store; zero and negative values are skipped
 */
private void setProperty(Metadata metadata, Property property, int value) {
    if (value <= 0) {
        return;
    }
    metadata.set(property, value);
}
Metadata metadata = new Metadata(); MediaType mediaType = MediaType.OCTET_STREAM; try {
/**
 * Sets an integer property on the {@code metadata} field when the value is positive.
 *
 * @param property property to set
 * @param value    value to store; zero and negative values are skipped
 */
private void set(Property property, int value) {
    if (value <= 0) {
        return;
    }
    metadata.set(property, value);
}
private void setupOOXMLMetadata(Metadata metadata) { // simple property metadata.set( TikaCoreProperties.LANGUAGE, "language" ); // language alternative metadata.set( TikaCoreProperties.TITLE, "title" ); // comma separated array metadata.set( TikaCoreProperties.SUBJECT, "keyword1,keyword2" ); // OOXML specific simple prop metadata.set( TikaCoreProperties.MODIFIER, "lastModifiedBy" ); }