public SolrRecord(int defaultMaxFieldLength, HashMap<String, Integer> maxFieldLengths, String filename, ArchiveRecordHeader header) { defaultMax = defaultMaxFieldLength; maxLengths = maxFieldLengths; setField(SolrFields.ID, "exception-at-" + filename + "@" + header.getOffset()); setField(SolrFields.SOURCE_FILE, filename); setField(SolrFields.SOURCE_FILE_OFFSET, "" + header.getOffset()); setField(SolrFields.SOLR_URL, Normalisation.sanitiseWARCHeaderValue(header.getUrl())); setField(SolrFields.SOLR_URL_TYPE, SolrFields.SOLR_URL_TYPE_UNKNOWN); }
public SolrRecord(int defaultMaxFieldLength, HashMap<String, Integer> maxFieldLengths, String filename, ArchiveRecordHeader header) { defaultMax = defaultMaxFieldLength; maxLengths = maxFieldLengths; setField(SolrFields.ID, "exception-at-" + filename + "@" + header.getOffset()); setField(SolrFields.SOURCE_FILE, filename); setField(SolrFields.SOURCE_FILE_OFFSET, "" + header.getOffset()); setField(SolrFields.SOLR_URL, Normalisation.sanitiseWARCHeaderValue(header.getUrl())); setField(SolrFields.SOLR_URL_TYPE, SolrFields.SOLR_URL_TYPE_UNKNOWN); }
solr.setField(SolrFields.CONTENT_TYPE_TIKA, contentType); solr.setField(SolrFields.FULL_CONTENT_TYPE, contentType); solr.setField(SolrFields.SOLR_CONTENT_TYPE, contentType.replaceAll(";.*$", "")); solr.setField(SolrFields.SOLR_NORMALISED_CONTENT_TYPE, "image"); solr.setField(SolrFields.SOLR_TYPE, "Image"); } else if (contentType.matches("^audio/.*$") || contentType.matches("^application/vnd.rn-realaudio$")) { solr.setField(SolrFields.SOLR_NORMALISED_CONTENT_TYPE, "audio"); solr.setField(SolrFields.SOLR_TYPE, "Audio"); } else if (contentType.matches("^video/.*$") || contentType.matches("^application/mp4$") || contentType.matches("^application/vnd.rn-realmedia$")) { solr.setField(SolrFields.SOLR_NORMALISED_CONTENT_TYPE, "video"); solr.setField(SolrFields.SOLR_TYPE, "Video"); } else if (contentType.matches("^text/htm.*$") || contentType.matches("^application/xhtml.*$")) { solr.setField(SolrFields.SOLR_NORMALISED_CONTENT_TYPE, "html"); solr.setField(SolrFields.SOLR_TYPE, "Web Page"); } else if (contentType.matches("^application/pdf.*$")) { solr.setField(SolrFields.SOLR_NORMALISED_CONTENT_TYPE, "pdf"); solr.setField(SolrFields.SOLR_TYPE, "Document"); } else if (contentType.matches("^.*word$")) { solr.setField(SolrFields.SOLR_NORMALISED_CONTENT_TYPE, "word"); solr.setField(SolrFields.SOLR_TYPE, "Document"); } else if (contentType.matches("^.*excel$")) {
solr.setField( SolrFields.CONTENT_TYPE_TIKA, contentType ); solr.setField( SolrFields.FULL_CONTENT_TYPE, contentType ); solr.setField( SolrFields.SOLR_CONTENT_TYPE, contentType.replaceAll( ";.*$", "" ) ); solr.setField( SolrFields.SOLR_NORMALISED_CONTENT_TYPE, "image" ); solr.setField(SolrFields.SOLR_TYPE, "Image"); } else if (contentType.matches("^audio/.*$") || contentType.matches("^application/vnd.rn-realaudio$")) { solr.setField( SolrFields.SOLR_NORMALISED_CONTENT_TYPE, "audio" ); solr.setField(SolrFields.SOLR_TYPE, "Audio"); } else if (contentType.matches("^video/.*$") || contentType.matches("^application/mp4$") || contentType.matches("^application/vnd.rn-realmedia$")) { solr.setField( SolrFields.SOLR_NORMALISED_CONTENT_TYPE, "video" ); solr.setField(SolrFields.SOLR_TYPE, "Video"); } else if (contentType.matches("^text/htm.*$") || contentType.matches("^application/xhtml.*$")) { solr.setField( SolrFields.SOLR_NORMALISED_CONTENT_TYPE, "html" ); solr.setField(SolrFields.SOLR_TYPE, "Web Page"); } else if( contentType.matches( "^application/pdf.*$" ) ) { solr.setField( SolrFields.SOLR_NORMALISED_CONTENT_TYPE, "pdf" ); solr.setField(SolrFields.SOLR_TYPE, "Document"); } else if( contentType.matches( "^.*word$" ) ) { solr.setField( SolrFields.SOLR_NORMALISED_CONTENT_TYPE, "word" ); solr.setField(SolrFields.SOLR_TYPE, "Document"); } else if( contentType.matches( "^.*excel$" ) ) { solr.setField( SolrFields.SOLR_NORMALISED_CONTENT_TYPE, "excel" );
/**
 * Creates an enricher for the named archive.
 *
 * <p>Derives an identifier from {@code archiveName} via {@code getWctTi}, creates a
 * fresh record through a {@link SolrRecordFactory} built with a {@code null}
 * argument, stores the identifier in {@link WctFields#WCT_INSTANCE_ID}, and then
 * passes the record to {@code getWctMetadata}.
 *
 * @param archiveName archive name the identifier is derived from
 */
public WctEnricher( String archiveName ) {
    String wctID = this.getWctTi( archiveName );

    // Never reduces field length size
    // NOTE(review): presumably a consequence of passing null to createFactory - confirm.
    solr = SolrRecordFactory.createFactory(null).createRecord();

    // These statements must stay on their own lines: when they shared a line with the
    // end-of-line comment above, they (and the closing brace) were commented out.
    solr.setField( WctFields.WCT_INSTANCE_ID, wctID );
    getWctMetadata( solr );
}
if (h.getName().equalsIgnoreCase(HttpHeaders.LOCATION)){ solr.setField(SolrFields.REDIRECT_TO_NORM, Normalisation.resolveRelative(targetUrl, location));
if (h.getName().equalsIgnoreCase(HttpHeaders.LOCATION)){ solr.setField(SolrFields.REDIRECT_TO_NORM, Normalisation.resolveRelative(targetUrl, location));
/**
 * Creates an enricher for the named archive.
 *
 * <p>Derives an identifier from {@code archiveName} via {@code getWctTi}, creates a
 * fresh record through a {@link SolrRecordFactory} built with a {@code null}
 * argument, stores the identifier in {@link WctFields#WCT_INSTANCE_ID}, and then
 * passes the record to {@code getWctMetadata}.
 *
 * @param archiveName archive name the identifier is derived from
 */
public WctEnricher( String archiveName ) {
    String wctID = this.getWctTi( archiveName );

    // Never reduces field length size
    // NOTE(review): presumably a consequence of passing null to createFactory - confirm.
    solr = SolrRecordFactory.createFactory(null).createRecord();

    // These statements must stay on their own lines: when they shared a line with the
    // end-of-line comment above, they (and the closing brace) were commented out.
    solr.setField( WctFields.WCT_INSTANCE_ID, wctID );
    getWctMetadata( solr );
}
UsableURI url = UsableURIFactory.getInstance(fullUrl); solr.setField(SolrFields.SOLR_URL_PATH, url.getPath()); solr.setField(SolrFields.SOLR_URL_TYPE, SolrFields.SOLR_URL_TYPE_SLASHPAGE); solr.setField(SolrFields.SOLR_URL_TYPE, SolrFields.SOLR_URL_TYPE_ROBOTS_TXT); } else { solr.setField(SolrFields.SOLR_URL_TYPE, SolrFields.SOLR_URL_TYPE_NORMAL); if (CANONICALISE_HOST) host = Normalisation.canonicaliseHost(host); solr.setField(SolrFields.SOLR_HOST, host); solr.setField(SolrFields.DOMAIN, domain); solr.setField(SolrFields.PUBLIC_SUFFIX, LinkExtractor.extractPublicSuffixFromHost(host));
UsableURI url = UsableURIFactory.getInstance(fullUrl); solr.setField(SolrFields.SOLR_URL_PATH, url.getPath()); solr.setField(SolrFields.SOLR_URL_TYPE, SolrFields.SOLR_URL_TYPE_SLASHPAGE); solr.setField(SolrFields.SOLR_URL_TYPE, SolrFields.SOLR_URL_TYPE_ROBOTS_TXT); } else { solr.setField(SolrFields.SOLR_URL_TYPE, SolrFields.SOLR_URL_TYPE_NORMAL); if (CANONICALISE_HOST) host = Normalisation.canonicaliseHost(host); solr.setField(SolrFields.SOLR_HOST, host); solr.setField(SolrFields.DOMAIN, domain); solr.setField(SolrFields.PUBLIC_SUFFIX, LinkExtractor.extractPublicSuffixFromHost(host));
solr.setField(SolrFields.SOLR_RECORD_TYPE, (String) header.getHeaderValue(HEADER_KEY_TYPE)); solr.setField(SolrFields.WARC_KEY_ID, (String) header.getHeaderValue(HEADER_KEY_ID)); solr.setField(SolrFields.WARC_IP, (String) header.getHeaderValue(HEADER_KEY_IP)); solr.setField(SolrFields.SOLR_RECORD_TYPE, "arc"); solr.setField( SolrFields.INSTITUTION, WARCIndexerCommand.institution ); solr.setField( SolrFields.COLLECTION, WARCIndexerCommand.collection ); solr.setField( SolrFields.COLLECTION_ID, WARCIndexerCommand.collection_id ); solr.setField(SolrFields.SOURCE_FILE, archiveName); solr.setField(SolrFields.SOURCE_FILE_OFFSET,"" + header.getOffset()); solr.setField(SolrFields.SOURCE_FILE_PATH, linuxFilePath); solr.setField(SolrFields.SOLR_URL, Normalisation.sanitiseWARCHeaderValue(header.getUrl())); if (addNormalisedURL) { solr.setField( SolrFields.SOLR_URL_NORMALISED, Normalisation.canonicaliseURL(targetUrl) ); solr.setField(SolrFields.CRAWL_DATE, formatter.format(crawlDate)); solr.setField(SolrFields.CRAWL_YEAR, getYearFromDate(crawlDate)); solr.setField(SolrFields.WAYBACK_DATE, waybackDate); solr.setField(SolrFields.SOLR_STATUS_CODE, statusCode); solr.setField(SolrFields.CONTENT_LENGTH, ""+content_length);
solr.setField(SolrFields.SOLR_RECORD_TYPE, (String) header.getHeaderValue(HEADER_KEY_TYPE)); solr.setField(SolrFields.WARC_KEY_ID, (String) header.getHeaderValue(HEADER_KEY_ID)); solr.setField(SolrFields.WARC_IP, (String) header.getHeaderValue(HEADER_KEY_IP)); solr.setField(SolrFields.SOLR_RECORD_TYPE, "arc"); solr.setField( SolrFields.INSTITUTION, WARCIndexerCommand.institution ); solr.setField( SolrFields.COLLECTION, WARCIndexerCommand.collection ); solr.setField( SolrFields.COLLECTION_ID, WARCIndexerCommand.collection_id ); solr.setField(SolrFields.SOURCE_FILE, archiveName); solr.setField(SolrFields.SOURCE_FILE_OFFSET,"" + header.getOffset()); solr.setField(SolrFields.SOURCE_FILE_PATH, header.getReaderIdentifier()); //Full path of file solr.setField(SolrFields.SOLR_URL, Normalisation.sanitiseWARCHeaderValue(header.getUrl())); if (addNormalisedURL) { solr.setField( SolrFields.SOLR_URL_NORMALISED, Normalisation.canonicaliseURL(targetUrl) ); solr.setField(SolrFields.SOLR_STATUS_CODE, statusCode); solr.setField(SolrFields.CONTENT_LENGTH, ""+content_length); solr.setField( SolrFields.ID, id ); solr.setField( SolrFields.HASH, hash ); Collections.sort(dateList); Date firstDate = dateList.get(0); solr.setField(SolrFields.CRAWL_DATE,
.getValues(FaceDetectionParser.FACE_FRAGMENT_ID).length; if (faces > 0) solr.setField(SolrFields.IMAGE_FACES_COUNT, "" + faces);
.getValues(FaceDetectionParser.FACE_FRAGMENT_ID).length; if (faces > 0) solr.setField(SolrFields.IMAGE_FACES_COUNT, "" + faces);