org.apache.tika.parser.AutoDetectParser java code examples

Refine search

/**
 * Minimal demonstration of the four-argument parse call: runs an
 * AutoDetectParser over an empty in-memory stream and hands the SAX
 * events to a bare DefaultHandler.
 *
 * @throws Exception if the parser reports a failure
 */
public static void useAutoDetectParser() throws Exception {
  InputStream emptyInput = new ByteArrayInputStream(new byte[0]);
  ContentHandler sink = new DefaultHandler();
  Metadata meta = new Metadata();
  ParseContext parseContext = new ParseContext();
  new AutoDetectParser().parse(emptyInput, sink, meta, parseContext);
}

// Parses sourceStream with Tika and then walks the names of the extracted metadata.
// NOTE(review): this excerpt is truncated - the closing braces of the finally block,
// of the filter check and of the loops are missing, and the code that builds the
// returned map (and uses prefix, maxAttribs and maxAttribLen) is not visible here.
private Map<String, String> tika_parse(InputStream sourceStream, String prefix, Integer maxAttribs,
                    Integer maxAttribLen) throws IOException, TikaException, SAXException {
  final Metadata metadata = new Metadata();
  final TikaInputStream tikaInputStream = TikaInputStream.get(sourceStream);
  try {
    // Content events go to a bare DefaultHandler; only the populated metadata is used below.
    autoDetectParser.parse(tikaInputStream, new DefaultHandler(), metadata);
  } finally {
    // Close the Tika wrapper whether or not parsing threw.
    tikaInputStream.close();
  // Optional key filter: a null pattern means no key is filtered out.
  final Pattern metadataKeyFilter = metadataKeyFilterRef.get();
  final StringBuilder dataBuilder = new StringBuilder();
  for (final String key : metadata.names()) {
    // Skip keys that do not fully match the filter pattern.
    if (metadataKeyFilter != null && !metadataKeyFilter.matcher(key).matches()) {
      continue;
    if (metadata.isMultiValued(key)) {
      for (String val : metadata.getValues(key)) {
        if (dataBuilder.length() > 1) {

 // Detects the media type of theInputStream and returns it as a string.
 // NOTE(review): the enclosing method header is outside this excerpt; theInputStream
 // and theFileName are presumably its parameters - confirm.
 // Both streams are closed automatically when the try block exits.
 try (InputStream is = theInputStream;
    BufferedInputStream bis = new BufferedInputStream(is);) {
  AutoDetectParser parser = new AutoDetectParser();
  // Only the parser's detector is used; the document itself is never parsed here.
  Detector detector = parser.getDetector();
  Metadata md = new Metadata();
  // Supply the file name through the metadata passed to the detector.
  md.add(Metadata.RESOURCE_NAME_KEY, theFileName);
  MediaType mediaType = detector.detect(bis, md);
  return mediaType.toString();
}

/**
 * Extracts the plain text of the bundled "test.doc" resource.
 * Only the "body" part of the document is returned.
 *
 * @return the text collected by the body content handler
 * @throws IOException if the resource cannot be read
 * @throws SAXException if the content handler reports an error
 * @throws TikaException if the document cannot be parsed
 */
public String parseToPlainText() throws IOException, SAXException, TikaException {
  BodyContentHandler bodyText = new BodyContentHandler();
  AutoDetectParser autoDetect = new AutoDetectParser();
  Metadata meta = new Metadata();
  try (InputStream doc = ContentHandlerExample.class.getResourceAsStream("test.doc")) {
    autoDetect.parse(doc, bodyText, meta);
  }
  return bodyText.toString();
}

/**
 * Parses the named file with an AutoDetectParser built from the given
 * configuration and returns the text collected by a BodyContentHandler.
 *
 * @param filename path of the file to parse
 * @param tikaConfig configuration used to construct the parser
 * @param metadata metadata object passed to both the stream factory and the parser
 * @return the text gathered by the body content handler
 * @throws Exception if the file cannot be opened or parsed
 */
public static String parseUsingAutoDetect(String filename, TikaConfig tikaConfig,
                     Metadata metadata) throws Exception {
  System.out.println("Handling using AutoDetectParser: [" + filename + "]");
  AutoDetectParser parser = new AutoDetectParser(tikaConfig);
  ContentHandler handler = new BodyContentHandler();
  // try-with-resources: the stream used to stay open after parsing, success or failure.
  try (TikaInputStream stream = TikaInputStream.get(new File(filename), metadata)) {
    parser.parse(stream, handler, metadata, new ParseContext());
  }
  return handler.toString();
}

/**
 * Extracts the contents of the bundled "test.doc" resource as markup,
 * returned as a string. The markup is whatever the ToXMLContentHandler
 * serialises from the parser's SAX events.
 *
 * @return the serialised output of the content handler
 * @throws IOException if the resource cannot be read
 * @throws SAXException if the content handler reports an error
 * @throws TikaException if the document cannot be parsed
 */
public String parseToHTML() throws IOException, SAXException, TikaException {
  ContentHandler markup = new ToXMLContentHandler();
  AutoDetectParser autoDetect = new AutoDetectParser();
  Metadata meta = new Metadata();
  try (InputStream doc = ContentHandlerExample.class.getResourceAsStream("test.doc")) {
    autoDetect.parse(doc, markup, meta);
  }
  return markup.toString();
}

 /**
  * Parses a fixed sample message file and prints its metadata entries,
  * sorted by name, one "name: value" pair per line.
  *
  * @param args ignored
  * @throws Exception if the file cannot be opened or parsed
  */
 public static void main(String[] args) throws Exception {
  File file = new File("/Users/jason/docstore/example_received_regular.msg");
  AutoDetectParser parser = new AutoDetectParser();
  // -1 is passed as the write limit so large bodies are not cut off.
  BodyContentHandler handler = new BodyContentHandler(-1);
  Metadata tikaMetadata = new Metadata();
  // try-with-resources: the stream was previously never closed.
  try (InputStream input = TikaInputStream.get(file, tikaMetadata)) {
    parser.parse(input, handler, tikaMetadata, new ParseContext());
  }
  String[] names = tikaMetadata.names();
  Arrays.sort(names);
  for (String name : names) {
    System.out.println(name + ": " + tikaMetadata.get(name));
  }
}

/**
 * Parses "embedded_with_npe.xml" through a ForkParser wrapping a
 * RecursiveParserWrapper and checks that metadata is reported for both the
 * container and the embedded document, including the exception text
 * recorded for the embedded resource.
 */
@Test
public void testRPWWithEmbeddedNPE() throws Exception {
  Parser autoDetect = new AutoDetectParser();
  RecursiveParserWrapper recursive = new RecursiveParserWrapper(autoDetect);
  RecursiveParserWrapperHandler handler = new RecursiveParserWrapperHandler(
      new BasicContentHandlerFactory(
          BasicContentHandlerFactory.HANDLER_TYPE.TEXT, 20000));
  ForkParser forkParser = new ForkParser(ForkParserTest.class.getClassLoader(), recursive);
  Metadata rootMetadata = new Metadata();
  ParseContext parseContext = new ParseContext();
  try (InputStream doc = getClass().getResourceAsStream("/test-documents/embedded_with_npe.xml")) {
    forkParser.parse(doc, handler, rootMetadata, parseContext);
  } finally {
    forkParser.close();
  }

  List<Metadata> results = handler.getMetadataList();

  // Entry 0: the container document.
  Metadata container = results.get(0);
  assertEquals("Nikolai Lobachevsky", container.get(TikaCoreProperties.CREATOR));
  String containerText = container.get(RecursiveParserWrapperHandler.TIKA_CONTENT);
  assertContains("main_content", containerText);
  assertContains("embed1.xml", containerText);

  // Entry 1: the embedded document, whose exception is recorded in its metadata.
  Metadata embedded = results.get(1);
  assertEquals("embeddedAuthor", embedded.get(TikaCoreProperties.CREATOR));
  assertContains("some_embedded_content",
      embedded.get(RecursiveParserWrapperHandler.TIKA_CONTENT));
  assertEquals("/embed1.xml",
      embedded.get(RecursiveParserWrapperHandler.EMBEDDED_RESOURCE_PATH));
  assertContains("another null pointer exception",
      embedded.get(RecursiveParserWrapperHandler.EMBEDDED_EXCEPTION));
}

  /**
   * Parses the given file with an AutoDetectParser and logs every metadata
   * name together with the value returned by {@code Metadata.get} for it.
   *
   * @param f file to inspect
   * @throws MapperException if the file cannot be opened or parsed; the
   *         original exception is attached as the cause
   */
  public void listMetadata(File f) throws MapperException {
    // try-with-resources: the stream was previously left open on success and on failure.
    try (InputStream stream = new FileInputStream(f)) {
      Metadata metadata = new Metadata();
      ContentHandler handler = new DefaultHandler();
      Parser parser = new AutoDetectParser();
      ParseContext context = new ParseContext();
      parser.parse(stream, handler, metadata, context);

      for (String key : metadata.names()) {
        String val = metadata.get(key);
        LOG.info("Found metadata \'" + key + "\': " + val);
      }

    } catch (Exception e) {
      LOG.error(e.toString(), e);
      throw new MapperException("Extracting metadata failed, file not found: " + f.getAbsolutePath(), e);
    }
  }
}

// Prints the content type that Tika records in the metadata for a spreadsheet file.
// NOTE(review): `stream` is used below but never declared in this excerpt -
// presumably an InputStream opened on `file`; confirm against the full source.
File file = new File("/pats/to/file.xls");
AutoDetectParser parser = new AutoDetectParser();
// Replace the parser map with an empty one. Presumably this is so that no format
// parser runs and only type detection fills in the metadata - confirm.
parser.setParsers(new HashMap<MediaType, Parser>());
Metadata metadata = new Metadata();
// Pass the file name along in the metadata given to the parser.
metadata.add(TikaMetadataKeys.RESOURCE_NAME_KEY, file.getName());
parser.parse(stream, new DefaultHandler(), metadata, new ParseContext());
// NOTE(review): not reached if parse() throws, so the stream would stay open.
stream.close();
String mimeType = metadata.get(HttpHeaders.CONTENT_TYPE);
System.out.println(mimeType);

/**
 * Returns a Single that, when subscribed, parses the given stream with an
 * AutoDetectParser and emits a map from each metadata name to the value
 * returned by {@code Metadata.get} for it. Any exception raised while
 * parsing or collecting is delivered through {@code onError}.
 * This method does not close {@code ins}.
 *
 * @param ins stream holding the document to inspect
 * @return a Single emitting the name-to-value metadata map
 */
@Override
public Single<Map<String, String>> getMetadata(InputStream ins) {
  return Single.create(emitter -> {
    Parser autoDetect = new AutoDetectParser();
    BodyContentHandler bodySink = new BodyContentHandler();
    Metadata extracted = new Metadata();
    ParseContext parseContext = new ParseContext();
    try {
      autoDetect.parse(ins, bodySink, extracted, parseContext);
      Map<String, String> result = new HashMap<>();
      for (String key : extracted.names()) {
        result.put(key, extracted.get(key));
      }
      emitter.onSuccess(result);
    } catch (Exception failure) {
      emitter.onError(failure);
    }
  });
}

 /**
  * Parses one readable file with Tika and outputs a ParseResult: a success
  * result carrying the extracted text and metadata, or a failure result
  * carrying whatever text was collected plus the exception.
  *
  * @param c process context supplying the file element and receiving the result
  * @throws Exception if the file cannot be opened or the stream cannot be closed
  */
 @ProcessElement
 public void processElement(ProcessContext c) throws Exception {
  ReadableFile readable = c.element();
  InputStream raw = Channels.newInputStream(readable.open());
  try (InputStream tikaStream = TikaInputStream.get(raw)) {
   // Use the configured parser when a TikaConfig is available, the default one otherwise.
   Parser parser;
   if (tikaConfig == null) {
    parser = new AutoDetectParser();
   } else {
    parser = new AutoDetectParser(tikaConfig);
   }
   ParseContext parseContext = new ParseContext();
   parseContext.set(Parser.class, parser);

   // Start from the metadata supplied by the spec, if any.
   Metadata tikaMetadata = spec.getInputMetadata();
   if (tikaMetadata == null) {
    tikaMetadata = new Metadata();
   }
   String contentTypeHint = spec.getContentTypeHint();
   if (contentTypeHint != null) {
    tikaMetadata.set(Metadata.CONTENT_TYPE, contentTypeHint);
   }

   String location = readable.getMetadata().resourceId().toString();
   ContentHandler textSink = new ToTextContentHandler();
   ParseResult outcome;
   try {
    parser.parse(tikaStream, textSink, tikaMetadata, parseContext);
    outcome = ParseResult.success(location, textSink.toString(), tikaMetadata);
   } catch (Exception parseFailure) {
    // A parse failure is reported as a result element rather than rethrown.
    outcome = ParseResult.failure(location, textSink.toString(), tikaMetadata, parseFailure);
   }
   c.output(outcome);
  }
 }
}

/**
 * Parses the resource at the given URI and returns its metadata as JSON:
 * an object mapping each metadata name to the array of its values.
 *
 * @param uri location of the resource to parse
 * @param contentType content type passed on to fillMetadata
 * @return JSON representation of the name-to-values metadata map
 * @throws Exception if the resource cannot be opened or parsed
 */
public static String extractMeta(String uri, String contentType) throws Exception {
  final AutoDetectParser parser = createParser();
  final Metadata metadata = new Metadata();
  fillMetadata(parser, metadata, contentType, uri);
  // try-with-resources: the stream used to be closed only when parsing succeeded.
  try (TikaInputStream inputStream = createInputStream(uri, metadata)) {
    parser.parse(inputStream, new DefaultHandler(), metadata);
  }
  final Map<String, String[]> meta = new HashMap<>();
  for (String name : metadata.names()) {
    meta.put(name, metadata.getValues(name));
  }
  return new Gson().toJson(meta);
}

};
// Detects the media type of each test file using the parser's detector.
// NOTE(review): this excerpt is truncated - the loop has no braces, and `filename`
// and `file` are not declared here (presumably derived from fileBase; confirm).
AutoDetectParser ap = new AutoDetectParser();
for (String fileBase : testFiles)
  Metadata metadata = new Metadata();
  // Supply the file name through the metadata passed to the detector.
  metadata.set(Metadata.RESOURCE_NAME_KEY, filename);
  // NOTE(review): the TikaInputStream created inline is not closed in the visible code.
  MediaType mt = ap.getDetector().detect(TikaInputStream.get(file), metadata);
  String mimetype = mt.toString();

// Determine the image type from the raw bytes when it has not been set yet.
// NOTE(review): `image` and `currentImageType` are declared outside this excerpt.
if(currentImageType ==null){
       ByteArrayInputStream is = new ByteArrayInputStream(image);
       // First attempt: the JDK's own stream-based content-type guess.
       String mimeType = URLConnection.guessContentTypeFromStream(is);
       if(mimeType == null){
         // Fallback: ask Tika's detector when the JDK could not identify the content.
         AutoDetectParser parser = new AutoDetectParser();
         Detector detector = parser.getDetector();
         Metadata md = new Metadata();
         mimeType = detector.detect(is,md).toString();
         // Collapse the detected type to a short name; this runs before the
         // chain below, so a pdf/tif match here takes precedence over it.
         if (mimeType.contains("pdf")){
           mimeType ="pdf";
         }
         else if(mimeType.contains("tif")||mimeType.contains("tiff")){
           mimeType = "tif";
         }
       }
       // Collapse the MIME string to a short name; the first matching branch wins.
       // A value matching none of the branches is passed on unchanged.
       if(mimeType.contains("png")){
         mimeType ="png";
       }
       else if( mimeType.contains("jpg")||mimeType.contains("jpeg")){
         mimeType = "jpg";
       }
       else if (mimeType.contains("pdf")){
         mimeType ="pdf";
       }
       else if(mimeType.contains("tif")||mimeType.contains("tiff")){
         mimeType = "tif";
       }
       currentImageType = ImageType.fromValue(mimeType);
     }

  /**
   * Parses standard input with an AutoDetectParser, feeding the content to a
   * LanguageHandler, and prints the language reported by the handler.
   *
   * @throws Exception if standard input cannot be read or parsed
   */
  public static void languageDetectionWithHandler() throws Exception {
    LanguageHandler languageSink = new LanguageHandler();
    AutoDetectParser autoDetect = new AutoDetectParser();
    autoDetect.parse(System.in, languageSink, new Metadata(), new ParseContext());

    System.out.println(languageSink.getLanguage().getLanguage());
  }
}

/**
 * Convenience overload: parses the stream with a freshly created
 * ParseContext in which this parser is registered under {@code Parser.class},
 * then delegates to the four-argument parse method.
 *
 * @param stream document stream to parse
 * @param handler receiver of the SAX events
 * @param metadata document metadata passed on to the four-argument overload
 * @throws IOException if the stream cannot be read
 * @throws SAXException if the handler reports an error
 * @throws TikaException if the document cannot be parsed
 */
public void parse(
    InputStream stream, ContentHandler handler, Metadata metadata)
    throws IOException, SAXException, TikaException {
  ParseContext selfRegisteringContext = new ParseContext();
  selfRegisteringContext.set(Parser.class, this);
  parse(stream, handler, metadata, selfRegisteringContext);
}

/**
 * Creates a reader for the text content of the given binary stream
 * with the given name. Delegates to the four-argument constructor with a
 * new {@code AutoDetectParser}, metadata built from the name, and a new
 * {@code ParseContext}.
 *
 * @param stream binary stream
 * @param name document name
 * @throws IOException if the document can not be parsed
 */
public ParsingReader(InputStream stream, String name) throws IOException {
  this(new AutoDetectParser(), stream, getMetadata(name), new ParseContext());
  // Register the parser in the context under Parser.class.
  // NOTE(review): `context` and `parser` are presumably fields assigned by the
  // delegated constructor - confirm.
  context.set(Parser.class, parser);
}

  /**
   * Extracts the text content of a file with the shared Tika parser.
   *
   * @param file office file
   * @return the text collected by the body content handler
   * @throws IOException if the file cannot be opened or read
   * @throws SAXException if the content handler reports an error
   * @throws TikaException if the file cannot be parsed
   */
  public static final String extract(File file) throws IOException, SAXException, TikaException {
    // Both objects are single-use and discarded when this call returns.
    final ContentHandler bodyText = new BodyContentHandler(-1); // -1 stands for "no limit"
    final Metadata meta = new Metadata();
    try (InputStream in = new FileInputStream(file)) {
      TikaTextExtractor.parser.parse(in, bodyText, meta);
    }
    return bodyText.toString();
  }
}

  /**
   * Parses a test document recursively and returns the list of metadata
   * objects gathered by the RecursiveParserWrapperHandler, with content
   * captured by XML handlers.
   *
   * @param filePath path of the document below "/test-documents/"
   * @param context parse context handed to the wrapper
   * @return the metadata list collected by the handler
   * @throws Exception if the resource cannot be read or parsed
   */
  protected List<Metadata> getRecursiveMetadata(String filePath, ParseContext context) throws Exception {
  Parser autoDetect = new AutoDetectParser();
  RecursiveParserWrapper recursive = new RecursiveParserWrapper(autoDetect);
  RecursiveParserWrapperHandler collector = new RecursiveParserWrapperHandler(
      new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.XML, -1));
  String resourcePath = "/test-documents/" + filePath;
  try (InputStream doc = getResourceAsStream(resourcePath)) {
    recursive.parse(doc, collector, new Metadata(), context);
  }
  return collector.getMetadataList();
}

Most used methods

<init>
Creates an auto-detecting parser instance using the specified set of parsers. This allows one to create…
parse
getDetector
Returns the type detector used by this parser to auto-detect the type of a document.
setDetector
Sets the type detector used by this parser to auto-detect the type of a document.
setFallback
getMediaTypeRegistry
getParsers
getSupportedTypes
setParsers

Popular in Java

Reactive rest calls using spring rest template
getSystemService (Context)
setScale (BigDecimal)
getExternalFilesDir (Context)
File (java.io)
An "abstract" representation of a file system entity identified by a pathname. The pathname may be a
System (java.lang)
Provides access to system-related information and resources including standard input and output. Ena
Thread (java.lang)
A thread is a thread of execution in a program. The Java Virtual Machine allows an application to ha
Collections (java.util)
This class consists exclusively of static methods that operate on or return collections. It contains
Date (java.util)
A specific moment in time, with millisecond precision. Values typically come from System#currentTime
IOUtils (org.apache.commons.io)
General IO stream manipulation utilities. This class provides static utility methods for input/outpu
Top plugins for Android Studio

How to use AutoDetectParser in org.apache.tika.parser

Best Java code snippets using org.apache.tika.parser.AutoDetectParser (Showing top 20 results out of 441)

Refine search

How to use
AutoDetectParser
in
org.apache.tika.parser