/** * Gets a parser based on content type, regardless of document reference * (ignoring it). * All parsers are assumed to have been configured properly * before the first call to this method. */ @Override public final IDocumentParser getParser( String documentReference, ContentType contentType) { // If ignoring content-type, do not even return a parser if (contentType != null && StringUtils.isNotBlank(ignoredContentTypesRegex) && contentType.toString().matches(ignoredContentTypesRegex)) { return null; } ensureParseHintsState(); IDocumentParser parser = parsers.get(contentType); if (parser == null) { return fallbackParser; } return parser; }
protected void initDefaultParsers() { // Fallback parser fallbackParser = new FallbackParser(); //TODO delete when released in Tika: //https://issues.apache.org/jira/browse/TIKA-2222 // PureEdge XFDL parsers.put( ContentType.valueOf("application/vnd.xfdl"), new XFDLParser()); }
ContentType ct = doc.getContentType(); if (ct != null) { ext = ct.getExtension();
|| StringUtils.isBlank(safeContentType.toString())) { try { safeContentType = + "\"application/octet-stream\".", e); safeContentType = ContentType.valueOf("application/octet-stream"); safeContentType.toString()); ContentFamily contentFamily = ContentFamily.forContentType(safeContentType.toString()); if (contentFamily != null) { meta.setString(ImporterMetadata.DOC_CONTENT_FAMILY,
ContentType ct = ContentType.valueOf(name); if (ct != null) { embedMeta.setEmbeddedType("file-object"); return "embedded-" + embedCount + "." + ct.getExtension();
/**
 * Computes a hash from the regex, parse hints, up-to-date flag, parser
 * map size, the fallback parser, and every registered parser entry.
 * <p>
 * Fix: the original called {@code fallbackParser.hashCode()} without a
 * null check, throwing a NullPointerException when no fallback parser
 * was set (it is reassigned during configuration loading and may be
 * null). Non-null inputs produce the same hash value as before.
 */
@Override
public int hashCode() {
    int hash = new HashCodeBuilder()
            .append(ignoredContentTypesRegex)
            .append(parseHints)
            .append(parsersAreUpToDate)
            .append(parsers.size())
            .toHashCode();
    // Null-safe: fallback parser may not have been configured yet.
    if (fallbackParser != null) {
        hash += fallbackParser.hashCode();
    }
    for (Entry<ContentType, IDocumentParser> entry : parsers.entrySet()) {
        ContentType ct = entry.getKey();
        hash += ct.hashCode();
        IDocumentParser parser = entry.getValue();
        if (parser == null) {
            continue;
        }
        hash += parser.hashCode();
    }
    return hash;
}
/** * Detects the content type from the given input stream. * @param content the content on which to detect content type * @return the detected content type * @throws IOException problem detecting content type */ public ContentType detect(InputStream content) throws IOException { Tika tika = new Tika(); String contentType = tika.detect(content); if (LOG.isDebugEnabled()) { LOG.debug("Detected \"" + contentType + "\" content-type for input stream."); } return ContentType.valueOf(contentType); } /**
ContentType.valueOf(cmd.getOptionValue(ARG_CONTENTTYPE)); String contentEncoding = cmd.getOptionValue(ARG_CONTENTENCODING); String output = cmd.getOptionValue(ARG_OUTPUTFILE);
/**
 * Builds the ordered value array for inserting a crawl-data row.
 * Content type is stored as its string form (or null) and the crawl
 * date as epoch milliseconds (0 when absent).
 * @param table target table name (unused here)
 * @param crawlData crawl data to persist
 * @return values in column insertion order
 */
@Override
public Object[] getInsertCrawlDataValues(
        String table, ICrawlData crawlData) {
    String contentType = crawlData.getContentType() == null
            ? null : crawlData.getContentType().toString();
    long crawlDate = crawlData.getCrawlDate() == null
            ? 0 : crawlData.getCrawlDate().getTime();
    return new Object[] {
            crawlData.getReference(),
            crawlData.getParentRootReference(),
            crawlData.isRootParentReference(),
            crawlData.getState().toString(),
            crawlData.getMetaChecksum(),
            crawlData.getContentChecksum(),
            contentType,
            crawlDate
    };
}
/**
 * Detects the content type of a stream, giving Tika the file extension
 * as a resource-name hint to assist content-based detection.
 * @param is stream to inspect
 * @param fileName original file name (only its extension is used)
 * @return detected content type
 * @throws IOException problem reading or detecting
 */
private ContentType doDetect(
        InputStream is, String fileName) throws IOException {
    try (TikaInputStream tikaStream = TikaInputStream.get(is)) {
        // Hint uses a synthetic name plus the real extension so detection
        // is not skewed by arbitrary path segments.
        String extension = extPattern.matcher(fileName).replaceFirst("$1");
        Metadata meta = new Metadata();
        meta.set(Metadata.RESOURCE_NAME_KEY, "file:///detect" + extension);
        MediaType media =
                getTikaConfig().getDetector().detect(tikaStream, meta);
        if (LOG.isDebugEnabled()) {
            LOG.debug("Detected \"" + media.toString()
                    + "\" content-type for: " + fileName);
        }
        return ContentType.valueOf(media.toString());
    }
}
}
@Override public Document toDocument(Stage stage, ICrawlData crawlData) { Document doc = new Document(); // "reference" is a Mongo indexed field, which is limited to 1024. // So if too long we truncate it, trying to keep it unique, // while storing the original in a separate field. String ref = StringUtil.truncateWithHash( crawlData.getReference(), 1024, "!"); doc.put(FIELD_REFERENCE, ref); if (!Objects.equals(ref, crawlData.getReference())) { doc.put(FIELD_REFERENCE_EXCESSIVE, crawlData.getReference()); } doc.put(FIELD_PARENT_ROOT_REFERENCE, crawlData.getParentRootReference()); doc.put(FIELD_IS_ROOT_PARENT_REFERENCE, crawlData.isRootParentReference()); doc.put(FIELD_CRAWL_STATE, crawlData.getState().toString()); doc.put(FIELD_META_CHECKSUM, crawlData.getMetaChecksum()); doc.put(FIELD_CONTENT_CHECKSUM, crawlData.getContentChecksum()); doc.put(FIELD_IS_VALID, crawlData.getState().isGoodState()); doc.put(FIELD_STAGE, stage.toString()); if (crawlData.getContentType() != null) { doc.put(FIELD_CONTENT_TYPE, crawlData.getContentType().toString()); } doc.put(FIELD_CRAWL_DATE, crawlData.getCrawlDate()); return doc; }
/**
 * Maps a database row to a crawl-data object.
 * <p>
 * Fix: the original used {@code StringUtils.isNoneBlank(contentType)} —
 * the varargs "none of several values are blank" check — on a single
 * value. With one argument it happens to behave like
 * {@code isNotBlank}, but it misstates intent; {@code isNotBlank} is
 * the correct single-value check and matches usage elsewhere in the
 * codebase.
 * @param table table the row comes from (unused here)
 * @param rs result set positioned on the row to map
 * @return the mapped crawl data, or <code>null</code> when rs is null
 * @throws SQLException database access problem
 */
@Override
public ICrawlData toCrawlData(String table, ResultSet rs)
        throws SQLException {
    if (rs == null) {
        return null;
    }
    BaseCrawlData data = new BaseCrawlData();
    data.setReference(rs.getString("reference"));
    data.setParentRootReference(rs.getString("parentRootReference"));
    data.setRootParentReference(rs.getBoolean("isRootParentReference"));
    data.setState(CrawlState.valueOf(rs.getString("state")));
    data.setMetaChecksum(rs.getString("metaChecksum"));
    data.setContentChecksum(rs.getString("contentChecksum"));
    String contentType = rs.getString("contentType");
    if (StringUtils.isNotBlank(contentType)) {
        data.setContentType(ContentType.valueOf(contentType));
    }
    // crawlDate is stored as epoch millis; 0 means "not set".
    long crawlDate = rs.getLong("crawlDate");
    if (crawlDate > 0) {
        data.setCrawlDate(new Date(crawlDate));
    }
    return data;
}
}
"ImporterDocument must have a content-type."); String contentType = doc.getContentType().toString();
ImporterMetadata.DOC_CONTENT_TYPE); if (StringUtils.isNotBlank(ct)) { doc.setContentType(ContentType.valueOf(ct));
/**
 * Loads this configuration from XML: ignored content types regex,
 * parse hints, fallback parser, and content-type-specific parsers.
 * @param in reader over the XML configuration
 * @throws IOException problem reading the configuration
 */
@Override
public void loadFromXML(Reader in) throws IOException {
    XMLConfiguration xml = XMLConfigurationUtil.newXMLConfiguration(in);

    setIgnoredContentTypesRegex(xml.getString(
            "ignoredContentTypes", getIgnoredContentTypesRegex()));

    // Parse hints
    loadParseHintsFromXML(xml);

    // Fallback parser
    fallbackParser = XMLConfigurationUtil.newInstance(
            xml, "fallbackParser", fallbackParser);

    // Parsers: every <parser> entry must declare its content type.
    for (HierarchicalConfiguration node
            : xml.configurationsAt("parsers.parser")) {
        IDocumentParser parser = XMLConfigurationUtil.newInstance(node);
        String contentType = node.getString("[@contentType]");
        if (StringUtils.isBlank(contentType)) {
            throw new ConfigurationException(
                    "Attribute \"contentType\" missing for parser: "
                    + node.getString("[@class]"));
        }
        parsers.put(ContentType.valueOf(contentType), parser);
    }
}
private void loadParseHintsFromXML(XMLConfiguration xml) {
String contentType = (String) doc.get(FIELD_CONTENT_TYPE); if (StringUtils.isNotBlank(contentType)) { data.setContentType(ContentType.valueOf(contentType));