protected void extractAndUpdateSimpleText() { if (fulltextConfiguration.fulltextSearchDisabled) { // if fulltext search is disabled, we don't extract simple text at all return; } for (String indexName : fulltextConfiguration.indexNames) { if (!fulltextConfiguration.indexesAllSimple.contains(indexName) && fulltextConfiguration.propPathsByIndexSimple.get(indexName) == null) { // nothing to do: index not configured for simple text continue; } Set<String> includedPaths = fulltextConfiguration.indexesAllSimple.contains(indexName) ? null : fulltextConfiguration.propPathsByIndexSimple.get(indexName); Set<String> excludedPaths = fulltextConfiguration.propPathsExcludedByIndexSimple.get(indexName); // get string properties List<String> strings = new StringsExtractor().findStrings(document, includedPaths, excludedPaths); // transform to text (remove HTML and entities) // we do this here rather than in the indexing backend (Elasticsearch) because it's more efficient here // add space at beginning and end for simulated phrase search using LIKE "% foo bar %" String text = strings.stream().map(this::stringToText).collect(Collectors.joining(" ", " ", " ")); // limit size text = limitStringSize(text, fulltextConfiguration.fulltextFieldSizeLimit); String property = getFulltextPropertyName(SYSPROP_FULLTEXT_SIMPLE, indexName); for (DocumentRef docRef : docsToUpdate) { session.setDocumentSystemProp(docRef, property, text); } } }
protected void extractAndUpdateBinaryText() { // we extract binary text even if fulltext search is disabled, // because it is still used to inject into external indexers like Elasticsearch BlobsExtractor blobsExtractor = new BlobsExtractor(); Map<Blob, String> blobsText = new IdentityHashMap<>(); for (String indexName : fulltextConfiguration.indexNames) { if (!fulltextConfiguration.indexesAllBinary.contains(indexName) && fulltextConfiguration.propPathsByIndexBinary.get(indexName) == null) { // nothing to do: index not configured for blob continue; } // get original text from all blobs blobsExtractor.setExtractorProperties(fulltextConfiguration.propPathsByIndexBinary.get(indexName), fulltextConfiguration.propPathsExcludedByIndexBinary.get(indexName), fulltextConfiguration.indexesAllBinary.contains(indexName)); List<String> strings = new ArrayList<>(); for (Blob blob : blobsExtractor.getBlobs(document)) { String string = blobsText.computeIfAbsent(blob, this::blobToText); strings.add(string); } // add space at beginning and end for simulated phrase search using LIKE "% foo bar %" String text = " " + String.join(" ", strings) + " "; text = limitStringSize(text, fulltextConfiguration.fulltextFieldSizeLimit); String property = getFulltextPropertyName(SYSPROP_FULLTEXT_BINARY, indexName); for (DocumentRef docRef : docsToUpdate) { session.setDocumentSystemProp(docRef, property, text); } } }