/**
 * Selects the parser that parsing work is handed off to.
 * <p>
 * This implementation asks the supplied parse context for a {@link Parser}
 * and uses {@link EmptyParser#INSTANCE} as the default value of that lookup.
 * Subclasses may override this method to choose the delegate differently.
 *
 * @since Apache Tika 0.7
 * @param context parse context
 * @return delegate parser
 */
protected Parser getDelegateParser(ParseContext context) {
    final Parser fallback = EmptyParser.INSTANCE;
    return context.get(Parser.class, fallback);
}
/**
 * Looks up the {@link PasswordProvider} held by this object's context.
 *
 * @return whatever the context returns for {@code PasswordProvider.class}
 */
public PasswordProvider getPasswordProvider() {
    final PasswordProvider provider = context.get(PasswordProvider.class);
    return provider;
}
/**
 * Looks up the object registered for the given interface, substituting a
 * caller-supplied default when the single-argument lookup yields
 * {@code null}.
 *
 * @param key the interface implemented by the requested object
 * @param defaultValue value to return if the requested object is not found
 * @return the object found for {@code key}, or {@code defaultValue} when
 *         the lookup returned {@code null}
 */
public <T> T get(Class<T> key, T defaultValue) {
    final T found = get(key);
    return (found != null) ? found : defaultValue;
}
@Override public Set<MediaType> getSupportedTypes(ParseContext context) { // If Tesseract is installed, offer our supported image types TesseractOCRConfig config = context.get(TesseractOCRConfig.class, defaultConfig); if (hasTesseract(config)) { return SUPPORTED_TYPES; } // Otherwise don't advertise anything, so the other image parsers // can be selected instead return Collections.emptySet(); }
protected AbstractPOIFSExtractor(ParseContext context, Metadata parentMetadata) { embeddedDocumentUtil = new EmbeddedDocumentUtil(context); this.passwordProvider = context.get(PasswordProvider.class); this.officeParserConfig = context.get(OfficeParserConfig.class, new OfficeParserConfig()); this.parentMetadata = parentMetadata; this.context = context; }
public AbstractOOXMLExtractor(ParseContext context, POIXMLTextExtractor extractor) { this.context = context; this.extractor = extractor; embeddedExtractor = EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context); // This has already been set by OOXMLParser's call to configure() // We can rely on this being non-null. this.config = context.get(OfficeParserConfig.class); }
/**
 * Ensures the given context carries an {@link OfficeParserConfig}.
 * <p>
 * The context is asked for its config, with
 * {@code defaultOfficeParserConfig} as the default value of that lookup;
 * the result is then written back into the context. The write happens on
 * every call, including when the lookup returned a config that was
 * already present.
 *
 * @param parseContext the context to inspect and update
 */
public void configure(ParseContext parseContext) {
    parseContext.set(
            OfficeParserConfig.class,
            parseContext.get(OfficeParserConfig.class, defaultOfficeParserConfig));
}
/**
 * Resolves the {@link EncodingDetector} to use for a parse.
 * <p>
 * A detector found in the ParseContext takes precedence; when the context
 * has none, the result of the no-argument {@code getEncodingDetector()}
 * is returned instead.
 *
 * @param parseContext context that may carry an {@link EncodingDetector}
 * @return the detector from the context, or this object's own detector
 */
protected EncodingDetector getEncodingDetector(ParseContext parseContext) {
    final EncodingDetector override = parseContext.get(EncodingDetector.class);
    return (override != null) ? override : getEncodingDetector();
}
/**
 * Decides whether an embedded document with the given metadata should be
 * parsed.
 * <p>
 * A {@link DocumentSelector} in the context has the final say. Failing
 * that, a {@link FilenameFilter} in the context is consulted, but only
 * when the metadata carries a resource name. In every other case the
 * answer is {@code true}.
 *
 * @param metadata metadata of the embedded document
 * @return {@code true} if the embedded document should be parsed
 */
public boolean shouldParseEmbedded(Metadata metadata) {
    DocumentSelector documentSelector = context.get(DocumentSelector.class);
    if (documentSelector != null) {
        return documentSelector.select(metadata);
    }

    FilenameFilter nameFilter = context.get(FilenameFilter.class);
    if (nameFilter == null) {
        return true;
    }

    // With no resource name there is nothing for the filter to judge.
    String resourceName = metadata.get(TikaCoreProperties.RESOURCE_NAME_KEY);
    return resourceName == null || nameFilter.accept(ABSTRACT_PATH, resourceName);
}
/** * @return Returns a {@link TikaConfig} -- trying to find it first in the ParseContext * that was included during initialization, and then creating a new one from * via {@link TikaConfig#getDefaultConfig()} if it can't find one in the * ParseContext. This caches the default config so that it only has to be created once. */ public TikaConfig getTikaConfig() { //be as lazy as possible and cache the TikaConfig if (tikaConfig == null) { tikaConfig = context.get(TikaConfig.class); if (tikaConfig == null) { tikaConfig = TikaConfig.getDefaultConfig(); } } return tikaConfig; }
/**
 * Returns a {@link TikaConfig}, preferring one found in the ParseContext
 * and otherwise calling {@link TikaConfig#getDefaultConfig()}. Nothing is
 * stored by this method; the lookup is repeated on every call.
 *
 * @return the config from the ParseContext, or the default config
 * @deprecated as of 1.17, use {@link #getTikaConfig()} instead
 */
@Deprecated
public TikaConfig getConfig() {
    final TikaConfig fromContext = context.get(TikaConfig.class);
    return (fromContext != null) ? fromContext : TikaConfig.getDefaultConfig();
}
public Detector getDetector() { //be as lazy as possible and cache Detector localDetector = context.get(Detector.class); if (localDetector != null) { return localDetector; } if (detector != null) { return detector; } detector = getTikaConfig().getDetector(); return detector; }
public MimeTypes getMimeTypes() { MimeTypes localMimeTypes = context.get(MimeTypes.class); //be as lazy as possible and cache the mimeTypes if (localMimeTypes != null) { return localMimeTypes; } if (mimeTypes != null) { return mimeTypes; } mimeTypes = getTikaConfig().getMimeRepository(); return mimeTypes; }
/**
 * Turns on inline-image extraction for the PDF parser, unless a config
 * file path is set or the context already holds a
 * {@link PDFParserConfig}. When it does act, it logs an informational
 * notice and registers the new config in the context.
 */
private void extractInlineImagesFromPDFs() {
    // Leave things alone when the user configured something themselves.
    if (configFilePath != null || context.get(PDFParserConfig.class) != null) {
        return;
    }

    PDFParserConfig inlineImageConfig = new PDFParserConfig();
    inlineImageConfig.setExtractInlineImages(true);

    LOG.info("As a convenience, TikaCLI has turned on extraction of\n"
            + "inline images for the PDFParser (TIKA-2374).\n"
            + "Aside from the -z option, this is not the default behavior\n"
            + "in Tika generally or in tika-server.");

    context.set(PDFParserConfig.class, inlineImageConfig);
}
@Override protected void parserPrepare(Parser parser, Metadata metadata, ParseContext context) { super.parserPrepare(parser, metadata, context); // Specify which charset to try String charset = context.get(CharsetTester.class).getNextCharset(); Charset charsetCS = Charset.forName(charset); context.set(EncodingDetector.class, new NonDetectingEncodingDetector(charsetCS)); }
/**
 * Parses the stream through the superclass, with the caller's handler
 * wrapped in a {@link CTAKESContentHandler}.
 * <p>
 * The {@link CTAKESConfig} comes from the context; a newly constructed
 * one is used when the context has none.
 *
 * @param stream   the document stream to parse
 * @param handler  the caller's content handler, wrapped before use
 * @param metadata document metadata, shared with the wrapping handler
 * @param context  parse context, optionally carrying a {@link CTAKESConfig}
 * @throws IOException   declared by the overridden method
 * @throws SAXException  declared by the overridden method
 * @throws TikaException declared by the overridden method
 */
@Override
public void parse(InputStream stream, ContentHandler handler, Metadata metadata,
        ParseContext context) throws IOException, SAXException, TikaException {
    CTAKESConfig settings = context.get(CTAKESConfig.class, new CTAKESConfig());
    super.parse(stream, new CTAKESContentHandler(handler, metadata, settings), metadata, context);
}
public OutlookExtractor(DirectoryNode root, ParseContext context) throws TikaException { super(context); this.parseContext = context; this.extractAllAlternatives = context.get(OfficeParserConfig.class).getExtractAllAlternativesFromMSG(); try { this.msg = new MAPIMessage(root); } catch (IOException e) { throw new TikaException("Failed to parse Outlook message", e); } }
/**
 * Returns the {@link EmbeddedDocumentExtractor} from the context, or
 * builds a {@link ParsingEmbeddedDocumentExtractor} when there is none.
 * <p>
 * Before building one, a {@code MockParser} is registered in the context
 * if the context has no {@link Parser} yet.
 *
 * @param context parse context; may be modified as described above
 * @return the extractor from the context, or a newly created one
 */
protected EmbeddedDocumentExtractor getEmbeddedDocumentExtractor(ParseContext context) {
    EmbeddedDocumentExtractor configured = context.get(EmbeddedDocumentExtractor.class);
    if (configured != null) {
        return configured;
    }

    if (context.get(Parser.class) == null) {
        context.set(Parser.class, new MockParser());
    }
    return new ParsingEmbeddedDocumentExtractor(context);
}
void handleMacros(PackagePart macroPart, ContentHandler handler) throws TikaException, SAXException { OfficeParserConfig officeParserConfig = context.get(OfficeParserConfig.class); if (officeParserConfig.getExtractMacros()) { try (InputStream is = macroPart.getInputStream()) { try (POIFSFileSystem poifs = new POIFSFileSystem(is)) { //Macro reading exceptions are already swallowed here OfficeParser.extractMacros(poifs, handler, embeddedExtractor); } } catch (IOException e) { throw new TikaException("Broken OOXML file", e); } } }
public static void parseInvestigation(InputStream stream, XHTMLContentHandler handler, Metadata metadata, ParseContext context, String studyFileName) throws IOException, TikaException, SAXException { TikaConfig tikaConfig = context.get(TikaConfig.class); if (tikaConfig == null) { tikaConfig = TikaConfig.getDefaultConfig(); } // Automatically detect the character encoding try (AutoDetectReader reader = new AutoDetectReader(new CloseShieldInputStream(stream), metadata, tikaConfig.getEncodingDetector())) { extractMetadata(reader, metadata, studyFileName); } }