/**
 * Wraps the given stream in a {@link URIDecodingFilter} that is configured
 * with {@code DEFAULT_ENCODING}.
 *
 * @param input the token stream to wrap
 * @return the newly created decoding filter
 */
@Override
public TokenStream create(final TokenStream input) {
  final TokenStream decodingFilter = new URIDecodingFilter(input, DEFAULT_ENCODING);
  return decodingFilter;
}
/**
 * Builds a URI decoding filter on top of {@code input} for the given charset.
 *
 * <p>The decoder obtained from the charset is set up to replace malformed
 * input and unmappable characters ({@link CodingErrorAction#REPLACE}). The
 * term and position-increment attributes are registered on this stream and
 * a 256-character term buffer is allocated.
 *
 * @param input The input token stream
 * @param charsetEncoding The name of a supported character encoding.
 * @throws UnsupportedCharsetException if the character encoding is not
 *         supported or recognised.
 */
public URIDecodingFilter(final TokenStream input, final String charsetEncoding)
    throws UnsupportedCharsetException {
  super(input);

  // Resolve the charset by name, then configure its decoder to substitute
  // rather than fail on bad input.
  charsetDecoder = lookupCharset(charsetEncoding)
      .newDecoder()
      .onMalformedInput(CodingErrorAction.REPLACE)
      .onUnmappableCharacter(CodingErrorAction.REPLACE);

  // Attribute registration order is kept: term first, position increment second.
  termAtt = addAttribute(CharTermAttribute.class);
  posIncrAtt = addAttribute(PositionIncrementAttribute.class);

  termBuffer = CharBuffer.allocate(256);
}
/**
 * Feeds {@code uri} to the tokenizer, wraps the tokenizer in a
 * {@link URIDecodingFilter} and checks every produced token against the
 * expectations.
 *
 * @param t tokenizer that supplies the tokens; it is given a reader over
 *          {@code uri} and reset by this method
 * @param encoding charset name passed to the {@link URIDecodingFilter} constructor
 * @param uri the text to tokenize
 * @param expectedStems expected term text of each token, in order; its length
 *                      determines how many tokens are checked
 * @param expectedTypes expected type of each token, or {@code null} to expect
 *                      {@code uritype} for every token
 * @param expectedPosIncr expected position increment of each token, or
 *                        {@code null} to skip that check
 * @throws IOException if the token stream fails
 */
private void assertURLDecodedTo(final Tokenizer t,
                                final String encoding,
                                final String uri,
                                final String[] expectedStems,
                                final String[] expectedTypes,
                                final int[] expectedPosIncr)
    throws IOException {
  assertTrue("has CharTermAttribute", t.hasAttribute(CharTermAttribute.class));
  final CharTermAttribute termAtt = t.getAttribute(CharTermAttribute.class);

  assertTrue("has TypeAttribute", t.hasAttribute(TypeAttribute.class));
  final TypeAttribute typeAtt = t.getAttribute(TypeAttribute.class);

  assertTrue("has PositionIncrementAttribute", t.hasAttribute(PositionIncrementAttribute.class));
  final PositionIncrementAttribute posIncrAtt = t.getAttribute(PositionIncrementAttribute.class);

  t.setReader(new StringReader(uri));
  // NOTE(review): reset() is called on the tokenizer before the filter wraps
  // it, so the filter's own reset() is never invoked here - confirm intended.
  t.reset();

  final URIDecodingFilter filter = new URIDecodingFilter(t, encoding);
  try {
    for (int i = 0; i < expectedStems.length; i++) {
      assertTrue("token " + i + " exists", filter.incrementToken());
      assertEquals(expectedStems[i], termAtt.toString());

      if (expectedTypes == null) {
        assertEquals(uritype, typeAtt.type());
      } else {
        assertEquals(expectedTypes[i], typeAtt.type());
      }

      if (expectedPosIncr != null) {
        assertEquals(expectedPosIncr[i], posIncrAtt.getPositionIncrement());
      }
    }
    filter.end();
  } finally {
    // Always close the stream, even when an assertion above fails, so the
    // caller-supplied tokenizer is not left open.
    filter.close();
  }
}
final int value = this.hexaToInt2(c1) + this.hexaToInt(c2); if (value == 32) { // replace the SPACE character, encoded by %20, by + this.decodeChars(); termBuffer.put('+'); } else if (value >= 0) { // Negative value are illegal. Just skip it. if (!decoded.hasRemaining()) { // No more place in the buffer, output what is already there. this.decodeChars(); this.decodeChars(); termBuffer.put('%').put(c1).put(c2); this.decodeChars();
@Override public final boolean incrementToken() throws IOException { if (modifiedURI) { // Return the previously decoded URI modifiedURI = false; termAtt.setEmpty(); termAtt.copyBuffer(termBuffer.array(), 0, termBuffer.position()); posIncrAtt.setPositionIncrement(0); return true; } if (input.incrementToken()) { termLength = termAtt.length(); this.updateBuffer(); this.decode(); return true; } return false; }
/**
 * Assembles the analysis chain for a field.
 *
 * <p>The reader is tokenized by a {@link WhitespaceTokenizer}; the tokens then
 * pass, in order, through a {@link URIDecodingFilter} created with "UTF-8",
 * {@code applyURINormalisation}, a {@link MailtoFilter}, a
 * {@link LowerCaseFilter}, a {@link StopFilter} using {@code stopSet}, and a
 * {@link LengthFilter} configured with the values 2 and 256.
 *
 * @param fieldName name of the field being analyzed (not used by this method)
 * @param reader source of the text to analyze
 * @return the components pairing the tokenizer with the end of the filter chain
 */
@Override
protected TokenStreamComponents createComponents(final String fieldName, final Reader reader) {
  final WhitespaceTokenizer tokenizer = new WhitespaceTokenizer(matchVersion, reader);

  // Each stage wraps the previous one; the order below is the processing order.
  final TokenStream decoded = new URIDecodingFilter(tokenizer, "UTF-8");
  final TokenStream normalised = this.applyURINormalisation(decoded);
  final TokenStream mailto = new MailtoFilter(normalised);
  final TokenStream lowerCased = new LowerCaseFilter(matchVersion, mailto);
  final TokenStream withoutStopWords = new StopFilter(matchVersion, lowerCased, stopSet);
  final TokenStream lengthBounded = new LengthFilter(matchVersion, true, withoutStopWords, 2, 256);

  return new TokenStreamComponents(tokenizer, lengthBounded);
}