/** * Gets a parser based on content type, regardless of document reference * (ignoring it). * All parsers are assumed to have been configured properly * before the first call to this method. */ @Override public final IDocumentParser getParser( String documentReference, ContentType contentType) { // If ignoring content-type, do not even return a parser if (contentType != null && StringUtils.isNotBlank(ignoredContentTypesRegex) && contentType.toString().matches(ignoredContentTypesRegex)) { return null; } ensureParseHintsState(); IDocumentParser parser = parsers.get(contentType); if (parser == null) { return fallbackParser; } return parser; }
/**
 * Builds the ordered values used to insert a crawl data record.
 * The returned array holds, in order: reference, parent root reference,
 * root-parent flag, crawl state (as a string), meta checksum,
 * content checksum, content type (as a string) and crawl date.
 *
 * @param table target table name (not used by this implementation)
 * @param crawlData crawl data to extract the values from
 * @return the eight insert values; the content type entry is
 *         <code>null</code> when the crawl data has no content type, and
 *         the crawl date entry is <code>0</code> when it has no crawl date
 */
@Override
public Object[] getInsertCrawlDataValues(
        String table, ICrawlData crawlData) {
    // Missing content type is stored as null.
    String contentTypeText = crawlData.getContentType() != null
            ? crawlData.getContentType().toString()
            : null;
    // Missing crawl date is stored as zero, otherwise its getTime() value.
    long crawlTime = crawlData.getCrawlDate() != null
            ? crawlData.getCrawlDate().getTime()
            : 0L;

    Object[] values = new Object[] {
        crawlData.getReference(),
        crawlData.getParentRootReference(),
        crawlData.isRootParentReference(),
        crawlData.getState().toString(),
        crawlData.getMetaChecksum(),
        crawlData.getContentChecksum(),
        contentTypeText,
        crawlTime
    };
    return values;
}
@Override public Document toDocument(Stage stage, ICrawlData crawlData) { Document doc = new Document(); // "reference" is a Mongo indexed field, which is limited to 1024. // So if too long we truncate it, trying to keep it unique, // while storing the original in a separate field. String ref = StringUtil.truncateWithHash( crawlData.getReference(), 1024, "!"); doc.put(FIELD_REFERENCE, ref); if (!Objects.equals(ref, crawlData.getReference())) { doc.put(FIELD_REFERENCE_EXCESSIVE, crawlData.getReference()); } doc.put(FIELD_PARENT_ROOT_REFERENCE, crawlData.getParentRootReference()); doc.put(FIELD_IS_ROOT_PARENT_REFERENCE, crawlData.isRootParentReference()); doc.put(FIELD_CRAWL_STATE, crawlData.getState().toString()); doc.put(FIELD_META_CHECKSUM, crawlData.getMetaChecksum()); doc.put(FIELD_CONTENT_CHECKSUM, crawlData.getContentChecksum()); doc.put(FIELD_IS_VALID, crawlData.getState().isGoodState()); doc.put(FIELD_STAGE, stage.toString()); if (crawlData.getContentType() != null) { doc.put(FIELD_CONTENT_TYPE, crawlData.getContentType().toString()); } doc.put(FIELD_CRAWL_DATE, crawlData.getCrawlDate()); return doc; }
"ImporterDocument must have a content-type."); String contentType = doc.getContentType().toString();