/** * Gets a parser based on content type, regardless of document reference * (ignoring it). * All parsers are assumed to have been configured properly * before the first call to this method. */ @Override public final IDocumentParser getParser( String documentReference, ContentType contentType) { // If ignoring content-type, do not even return a parser if (contentType != null && StringUtils.isNotBlank(ignoredContentTypesRegex) && contentType.toString().matches(ignoredContentTypesRegex)) { return null; } ensureParseHintsState(); IDocumentParser parser = parsers.get(contentType); if (parser == null) { return fallbackParser; } return parser; }
protected void initDefaultParsers() { // Fallback parser fallbackParser = new FallbackParser(); //TODO delete when released in Tika: //https://issues.apache.org/jira/browse/TIKA-2222 // PureEdge XFDL parsers.put( ContentType.valueOf("application/vnd.xfdl"), new XFDLParser()); }
ContentType ct = doc.getContentType(); if (ct != null) { ext = ct.getExtension();
|| StringUtils.isBlank(safeContentType.toString())) { try { safeContentType = + "\"application/octet-stream\".", e); safeContentType = ContentType.valueOf("application/octet-stream"); safeContentType.toString()); ContentFamily contentFamily = ContentFamily.forContentType(safeContentType.toString()); if (contentFamily != null) { meta.setString(ImporterMetadata.DOC_CONTENT_FAMILY,
ContentType ct = ContentType.valueOf(name); if (ct != null) { embedMeta.setEmbeddedType("file-object"); return "embedded-" + embedCount + "." + ct.getExtension();
/**
 * Computes a hash from the regex, parse hints, up-to-date flag, parser
 * map size, the fallback parser, and every registered parser entry.
 * <p>
 * Fix: the original called {@code fallbackParser.hashCode()} without a
 * null check, throwing a NullPointerException when no fallback parser
 * was set (it is reassigned during configuration loading and may be
 * null). Non-null inputs produce the same hash value as before.
 */
@Override
public int hashCode() {
    int hash = new HashCodeBuilder()
            .append(ignoredContentTypesRegex)
            .append(parseHints)
            .append(parsersAreUpToDate)
            .append(parsers.size())
            .toHashCode();
    // Null-safe: fallback parser may not have been configured yet.
    if (fallbackParser != null) {
        hash += fallbackParser.hashCode();
    }
    for (Entry<ContentType, IDocumentParser> entry : parsers.entrySet()) {
        ContentType ct = entry.getKey();
        hash += ct.hashCode();
        IDocumentParser parser = entry.getValue();
        if (parser == null) {
            continue;
        }
        hash += parser.hashCode();
    }
    return hash;
}
/** * Detects the content type from the given input stream. * @param content the content on which to detect content type * @return the detected content type * @throws IOException problem detecting content type */ public ContentType detect(InputStream content) throws IOException { Tika tika = new Tika(); String contentType = tika.detect(content); if (LOG.isDebugEnabled()) { LOG.debug("Detected \"" + contentType + "\" content-type for input stream."); } return ContentType.valueOf(contentType); } /**
ContentType.valueOf(cmd.getOptionValue(ARG_CONTENTTYPE)); String contentEncoding = cmd.getOptionValue(ARG_CONTENTENCODING); String output = cmd.getOptionValue(ARG_OUTPUTFILE);
/**
 * Builds the ordered value array for inserting a crawl-data row.
 * Content type is stored as its string form (or null) and the crawl
 * date as epoch milliseconds (0 when absent).
 * @param table target table name (unused here)
 * @param crawlData crawl data to persist
 * @return values in column insertion order
 */
@Override
public Object[] getInsertCrawlDataValues(
        String table, ICrawlData crawlData) {
    String contentType = crawlData.getContentType() == null
            ? null : crawlData.getContentType().toString();
    long crawlDate = crawlData.getCrawlDate() == null
            ? 0 : crawlData.getCrawlDate().getTime();
    return new Object[] {
            crawlData.getReference(),
            crawlData.getParentRootReference(),
            crawlData.isRootParentReference(),
            crawlData.getState().toString(),
            crawlData.getMetaChecksum(),
            crawlData.getContentChecksum(),
            contentType,
            crawlDate
    };
}
/**
 * Detects the content type of a stream, giving Tika the file extension
 * as a resource-name hint to assist content-based detection.
 * @param is stream to inspect
 * @param fileName original file name (only its extension is used)
 * @return detected content type
 * @throws IOException problem reading or detecting
 */
private ContentType doDetect(
        InputStream is, String fileName) throws IOException {
    try (TikaInputStream tikaStream = TikaInputStream.get(is)) {
        // Hint uses a synthetic name plus the real extension so detection
        // is not skewed by arbitrary path segments.
        String extension = extPattern.matcher(fileName).replaceFirst("$1");
        Metadata meta = new Metadata();
        meta.set(Metadata.RESOURCE_NAME_KEY, "file:///detect" + extension);
        MediaType media =
                getTikaConfig().getDetector().detect(tikaStream, meta);
        if (LOG.isDebugEnabled()) {
            LOG.debug("Detected \"" + media.toString()
                    + "\" content-type for: " + fileName);
        }
        return ContentType.valueOf(media.toString());
    }
}
}
@Override public Document toDocument(Stage stage, ICrawlData crawlData) { Document doc = new Document(); // "reference" is a Mongo indexed field, which is limited to 1024. // So if too long we truncate it, trying to keep it unique, // while storing the original in a separate field. String ref = StringUtil.truncateWithHash( crawlData.getReference(), 1024, "!"); doc.put(FIELD_REFERENCE, ref); if (!Objects.equals(ref, crawlData.getReference())) { doc.put(FIELD_REFERENCE_EXCESSIVE, crawlData.getReference()); } doc.put(FIELD_PARENT_ROOT_REFERENCE, crawlData.getParentRootReference()); doc.put(FIELD_IS_ROOT_PARENT_REFERENCE, crawlData.isRootParentReference()); doc.put(FIELD_CRAWL_STATE, crawlData.getState().toString()); doc.put(FIELD_META_CHECKSUM, crawlData.getMetaChecksum()); doc.put(FIELD_CONTENT_CHECKSUM, crawlData.getContentChecksum()); doc.put(FIELD_IS_VALID, crawlData.getState().isGoodState()); doc.put(FIELD_STAGE, stage.toString()); if (crawlData.getContentType() != null) { doc.put(FIELD_CONTENT_TYPE, crawlData.getContentType().toString()); } doc.put(FIELD_CRAWL_DATE, crawlData.getCrawlDate()); return doc; }
/**
 * Maps a database row to a crawl-data object.
 * <p>
 * Fix: the original used {@code StringUtils.isNoneBlank(contentType)} —
 * the varargs "none of several values are blank" check — on a single
 * value. With one argument it happens to behave like
 * {@code isNotBlank}, but it misstates intent; {@code isNotBlank} is
 * the correct single-value check and matches usage elsewhere in the
 * codebase.
 * @param table table the row comes from (unused here)
 * @param rs result set positioned on the row to map
 * @return the mapped crawl data, or <code>null</code> when rs is null
 * @throws SQLException database access problem
 */
@Override
public ICrawlData toCrawlData(String table, ResultSet rs)
        throws SQLException {
    if (rs == null) {
        return null;
    }
    BaseCrawlData data = new BaseCrawlData();
    data.setReference(rs.getString("reference"));
    data.setParentRootReference(rs.getString("parentRootReference"));
    data.setRootParentReference(rs.getBoolean("isRootParentReference"));
    data.setState(CrawlState.valueOf(rs.getString("state")));
    data.setMetaChecksum(rs.getString("metaChecksum"));
    data.setContentChecksum(rs.getString("contentChecksum"));
    String contentType = rs.getString("contentType");
    if (StringUtils.isNotBlank(contentType)) {
        data.setContentType(ContentType.valueOf(contentType));
    }
    // crawlDate is stored as epoch millis; 0 means "not set".
    long crawlDate = rs.getLong("crawlDate");
    if (crawlDate > 0) {
        data.setCrawlDate(new Date(crawlDate));
    }
    return data;
}
}
"ImporterDocument must have a content-type."); String contentType = doc.getContentType().toString();
ImporterMetadata.DOC_CONTENT_TYPE); if (StringUtils.isNotBlank(ct)) { doc.setContentType(ContentType.valueOf(ct));
/**
 * Loads this configuration from XML: ignored content types regex,
 * parse hints, fallback parser, and content-type-specific parsers.
 * @param in reader over the XML configuration
 * @throws IOException problem reading the configuration
 */
@Override
public void loadFromXML(Reader in) throws IOException {
    XMLConfiguration xml = XMLConfigurationUtil.newXMLConfiguration(in);

    setIgnoredContentTypesRegex(xml.getString(
            "ignoredContentTypes", getIgnoredContentTypesRegex()));

    // Parse hints
    loadParseHintsFromXML(xml);

    // Fallback parser
    fallbackParser = XMLConfigurationUtil.newInstance(
            xml, "fallbackParser", fallbackParser);

    // Parsers: every <parser> entry must declare its content type.
    for (HierarchicalConfiguration node
            : xml.configurationsAt("parsers.parser")) {
        IDocumentParser parser = XMLConfigurationUtil.newInstance(node);
        String contentType = node.getString("[@contentType]");
        if (StringUtils.isBlank(contentType)) {
            throw new ConfigurationException(
                    "Attribute \"contentType\" missing for parser: "
                    + node.getString("[@class]"));
        }
        parsers.put(ContentType.valueOf(contentType), parser);
    }
}
private void loadParseHintsFromXML(XMLConfiguration xml) {
String contentType = (String) doc.get(FIELD_CONTENT_TYPE); if (StringUtils.isNotBlank(contentType)) { data.setContentType(ContentType.valueOf(contentType));