net.htmlparser.jericho.CharacterReference java code examples

/**
 * Converts HTML encoded text into plain text, treating it as text that lies outside of an attribute value.
 * <p>
 * Every {@linkplain CharacterEntityReference character entity reference} and
 * {@linkplain NumericCharacterReference numeric character reference} is replaced by the character it represents.
 * <p>
 * Calling this method gives the same result as {@link #decode(CharSequence,boolean) decode(encodedText,false)}.
 * <p>
 * <a href="#Unterminated">Unterminated</a> character references are handled using the rules that the
 * {@linkplain Config#CurrentCompatibilityMode current compatibility mode} defines for text outside of attribute values.
 * <p>
 * When the static {@link Config#ConvertNonBreakingSpaces} property is <code>true</code> (the default),
 * non-breaking space ({@link CharacterEntityReference#_nbsp &amp;nbsp;}) character entity references are decoded to normal spaces.
 * <p>
 * Character entity reference names are case sensitive, and some names differ from others only by case,
 * yet some browsers also accept them case-insensitively.
 * The decoding methods in this library therefore recognise character entity reference names even when their case is wrong.
 *
 * @param encodedText  the text to decode.
 * @return the decoded string.
 * @see #encode(CharSequence)
 */
public static String decode(final CharSequence encodedText) {
  final boolean insideAttributeValue=false; // this overload always decodes as text outside of an attribute value
  return decode(encodedText,insideAttributeValue,Config.ConvertNonBreakingSpaces);
}

/**
 * Normalises already encoded text by {@linkplain #decode(CharSequence,boolean) decoding} it and then
 * {@linkplain #encode(CharSequence,boolean) encoding} the result again.
 * <p>
 * The purpose of the round trip is to make sure the returned text contains no characters that were left unencoded in the input.
 * <p>
 * IMPLEMENTATION NOTE: The current implementation is nothing more than a call to
 * {@link #decode(CharSequence,boolean) decode} followed by a call to {@link #encode(CharSequence,boolean) encode},
 * each with <code>insideAttributeValue</code> set to <code>true</code>.
 *
 * @param encodedText  the text to re-encode.
 * @return the re-encoded string.
 */
public static String reencode(final CharSequence encodedText) {
  final String decodedText=decode(encodedText,true);
  return encode(decodedText,true);
}

/**
 * Returns the source text in the range <code>[start, end)</code> with each character reference
 * that lies completely inside that range replaced by the character it represents.
 * <p>
 * A character reference that begins inside the range but finishes after <code>end</code> is not decoded;
 * the part of it that lies inside the range is copied verbatim. Without this rule the copy position would
 * move past <code>end</code> and the final <code>subSequence</code> call would be given an inverted range.
 * <p>
 * References are appended by code point rather than via <code>getChar()</code>, so that supplementary
 * characters are emitted as a full surrogate pair instead of being truncated to a single <code>char</code>.
 *
 * @param start  the position in the source at which to begin (inclusive).
 * @param end  the position in the source at which to stop (exclusive).
 * @return the decoded text of the specified range.
 */
private String parseText(int start, int end) {
  final StringBuilder sb = new StringBuilder();
  int pos = start;
  while (pos < end) {
    final CharacterReference ref = source.getNextCharacterReference(pos);
    // Stop at the first reference that is missing, starts outside the range, or straddles its end.
    if (ref == null || ref.getBegin() >= end || ref.getEnd() > end) {
      break;
    }
    sb.append(source.subSequence(pos, ref.getBegin()));
    sb.appendCodePoint(ref.getCodePoint());
    pos = ref.getEnd();
  }
  // Copy whatever remains after the last decoded reference unchanged.
  sb.append(source.subSequence(pos, end));
  return sb.toString();
}

/**
 * Appends a normalised form of the specified value: the value is first decoded and the result is then
 * encoded again, both steps with <code>insideAttributeValue</code> set to <code>true</code>.
 *
 * @param appendable  the destination of the re-encoded value.
 * @param unencodedValue  the value to normalise.
 * @throws IOException if the appendable throws it.
 */
private static void appendTidyValue(final Appendable appendable, final CharSequence unencodedValue) throws IOException {
  final String decodedValue=CharacterReference.decode(unencodedValue,true);
  CharacterReference.appendEncode(appendable,decodedValue,true);
}

/**
 * Passes the text through <code>appendCollapseWhiteSpace</code> and then decodes the result as text
 * outside of an attribute value.
 *
 * @param text  the encoded text to process.
 * @param convertNonBreakingSpaces  forwarded unchanged to the decoding step.
 * @return the decoded string.
 */
static String decodeCollapseWhiteSpace(final CharSequence text, final boolean convertNonBreakingSpaces) {
  // The builder is presized to the input length before being filled by appendCollapseWhiteSpace.
  final CharSequence collapsedText=appendCollapseWhiteSpace(new StringBuilder(text.length()),text);
  return decode(collapsedText,false,convertNonBreakingSpaces);
}

/**
 * Appends the character represented by this character reference to the specified appendable object.
 * <p>
 * A supplementary code point is written as its high surrogate followed by its low surrogate.
 * Any other code point is written as a single <code>char</code>, except that a non-breaking space is
 * written as a normal space when <code>convertNonBreakingSpaces</code> is <code>true</code>.
 *
 * @param appendable  the object to append the character to.
 * @param convertNonBreakingSpaces  whether a non-breaking space is replaced with a normal space.
 * @throws IOException if the appendable throws it.
 */
private void appendCharTo(Appendable appendable, final boolean convertNonBreakingSpaces) throws IOException {
  if (Character.isSupplementaryCodePoint(codePoint)) {
    appendable.append(getHighSurrogate(codePoint));
    appendable.append(getLowSurrogate(codePoint));
    return;
  }
  final char decodedChar=getChar();
  final boolean replaceWithSpace=decodedChar==CharacterEntityReference._nbsp && convertNonBreakingSpaces;
  appendable.append(replaceWithSpace ? ' ' : decodedChar);
}

/**
 * Decodes the text of the specified segment and appends the result to the specified appendable object.
 * <p>
 * The segment is scanned for <code>'&amp;'</code> characters, beginning <code>searchBegin</code> characters after the
 * start of the segment. Wherever a character reference can be constructed at such a position, the literal text
 * before it is appended followed by the character it represents; an ampersand that does not start a
 * character reference is left as part of the literal text.
 *
 * @param appendable  the destination of the decoded text.
 * @param segment  the segment whose text is to be decoded.
 * @param searchBegin  offset from the start of the segment at which the search for ampersands begins.
 * @param insideAttributeValue  selects which unterminated character reference settings of the current compatibility mode apply.
 * @param convertNonBreakingSpaces  whether non-breaking spaces are appended as normal spaces.
 * @return the specified appendable object.
 * @throws IOException if the appendable throws it.
 */
private static Appendable appendDecode(final Appendable appendable, final Segment segment, final int searchBegin, final boolean insideAttributeValue, final boolean convertNonBreakingSpaces) throws IOException {
  final Config.UnterminatedCharacterReferenceSettings settings=Config.CurrentCompatibilityMode.getUnterminatedCharacterReferenceSettings(insideAttributeValue);
  final Source source=segment.source;
  final ParseText parseText=source.getParseText();
  final int end=segment.getEnd();
  int literalBegin=segment.getBegin(); // start of the text that has not been appended yet
  int ampersandPos=parseText.indexOf('&',literalBegin+searchBegin,end);
  while (ampersandPos!=-1) {
    final CharacterReference reference=CharacterReference.construct(source,ampersandPos,settings);
    if (reference==null) {
      // Not a character reference: keep the ampersand in the pending literal text and look for the next one.
      ampersandPos=parseText.indexOf('&',ampersandPos+1,end);
      continue;
    }
    // Don't use appendable.append(source,literalBegin,ampersandPos) as it checks source.length() which may throw an exception when using StreamedSource.
    appendable.append(source.substring(literalBegin,ampersandPos));
    reference.appendCharTo(appendable,convertNonBreakingSpaces);
    literalBegin=reference.getEnd();
    ampersandPos=parseText.indexOf('&',literalBegin,end);
  }
  // Flush the literal text that follows the last decoded reference.
  appendable.append(source.substring(literalBegin,end));
  return appendable;
}

/**
 * Encodes the specified text, replacing certain characters with character references.
 * <p>
 * Calling this method gives the same result as {@link #encode(CharSequence,boolean) encode(unencodedText,true)}.
 *
 * @param unencodedText  the text to encode.
 * @return the encoded string.
 */
public static String encode(final CharSequence unencodedText) {
  final boolean insideAttributeValue=true; // this overload always encodes as if inside an attribute value
  return encode(unencodedText,insideAttributeValue);
}

/**
 * Builds the HTML of the element used to display a value in place of a form control.
 * <p>
 * The element name comes from <code>FormControlOutputStyle.ConfigDisplayValue.ElementName</code>. Each attribute listed in
 * <code>FormControlOutputStyle.ConfigDisplayValue.AttributeNames</code> is copied from <code>elementContainer</code>
 * when it has a non-null value. The content is <code>FormControlOutputStyle.ConfigDisplayValue.EmptyHTML</code> when
 * the text is <code>null</code> or empty, otherwise the encoded text.
 *
 * @param text  the text to display, may be <code>null</code>.
 * @param whiteSpaceFormatting  whether the text is encoded with <code>CharacterReference.encodeWithWhiteSpaceFormatting</code> rather than plainly encoded.
 * @return the HTML of the display value element.
 */
final String getDisplayValueHTML(final CharSequence text, final boolean whiteSpaceFormatting) {
  final int textLength=(text==null) ? 0 : text.length();
  final StringBuilder html=new StringBuilder(textLength*2+50);
  html.append('<');
  html.append(FormControlOutputStyle.ConfigDisplayValue.ElementName);
  try {
    for (String attributeName : FormControlOutputStyle.ConfigDisplayValue.AttributeNames) {
      final CharSequence attributeValue=elementContainer.getAttributeValue(attributeName);
      if (attributeValue!=null) {
        Attribute.appendHTML(html,attributeName,attributeValue);
      }
    }
    html.append('>');
    if (textLength==0) {
      html.append(FormControlOutputStyle.ConfigDisplayValue.EmptyHTML);
    } else if (whiteSpaceFormatting) {
      html.append(CharacterReference.encodeWithWhiteSpaceFormatting(text));
    } else {
      CharacterReference.appendEncode(html,text,false);
    }
  } catch (IOException ex) {
    throw new RuntimeException(ex); // never happens
  }
  html.append(EndTagType.START_DELIMITER_PREFIX);
  html.append(FormControlOutputStyle.ConfigDisplayValue.ElementName);
  html.append('>');
  return html.toString();
}

if (!isWhiteSpace(ch)) {
  appendEncode(appendable,ch,false);
  continue;

  /**
   * Decodes the specified encoded text and appends the result to the specified appendable object.
   * <p>
   * A {@link Segment} argument is handed to the segment specific overload. Any other character sequence is
   * iterated with a {@link StreamedSource} that has tag handling switched off; each character reference it yields
   * is appended as the character it represents and every other segment is appended as is.
   *
   * @param appendable  the destination of the decoded text.
   * @param encodedText  the text to decode.
   * @param searchBegin  passed to the segment overload, or to <code>StreamedSource.setSearchBegin</code>.
   * @param insideAttributeValue  selects which unterminated character reference settings of the current compatibility mode apply.
   * @param convertNonBreakingSpaces  whether non-breaking spaces are appended as normal spaces.
   * @return the specified appendable object, or the result of the segment overload.
   * @throws IOException if the appendable throws it.
   */
  private static Appendable appendDecode(final Appendable appendable, final CharSequence encodedText, final int searchBegin, final boolean insideAttributeValue, final boolean convertNonBreakingSpaces) throws IOException {
    if (encodedText instanceof Segment) {
      return appendDecode(appendable,(Segment)encodedText,searchBegin,insideAttributeValue,convertNonBreakingSpaces);
    }
    final Config.UnterminatedCharacterReferenceSettings settings=Config.CurrentCompatibilityMode.getUnterminatedCharacterReferenceSettings(insideAttributeValue);
    final StreamedSource streamedSource=new StreamedSource(encodedText).setHandleTags(false).setUnterminatedCharacterReferenceSettings(settings).setSearchBegin(searchBegin);
    for (final Segment piece : streamedSource) {
      if (!(piece instanceof CharacterReference)) {
        // benchmark tests reveal (surprisingly) that converting to a string before appending is faster than appending the specified section of the encodedText or segment directly.
        appendable.append(piece.toString());
        continue;
      }
      ((CharacterReference)piece).appendCharTo(appendable,convertNonBreakingSpaces);
    }
    return appendable;
  }

  /**
   * Appends an attribute in HTML syntax: a space, the name and, when the value is not <code>null</code>,
   * an equals sign followed by the encoded value enclosed in double quotes.
   *
   * @param appendable  the destination of the attribute HTML.
   * @param name  the attribute name, appended without encoding.
   * @param value  the attribute value, or <code>null</code> to append the name only.
   * @return the specified appendable object.
   * @throws IOException if the appendable throws it.
   */
  static Appendable appendHTML(final Appendable appendable, final CharSequence name, final CharSequence value) throws IOException {
    appendable.append(' ').append(name);
    if (value==null) {
      return appendable; // no value: the attribute consists of its name only
    }
    appendable.append("=\"");
    CharacterReference.appendEncode(appendable,value,true);
    appendable.append('"');
    return appendable;
  }
}

/**
 * Returns the <a href="NumericCharacterReference.html#DecimalCharacterReference">decimal encoded form</a> of the specified unicode code point.
 * <p>
 * <dl>
 *  <dt>Example:</dt>
 *  <dd><code>CharacterReference.getDecimalCharacterReferenceString('&gt;')</code> returns "<code>&amp;#62;</code>"</dd>
 * </dl>
 * <p>
 * The string is produced by appending the decimal form to a new <code>StringBuilder</code>.
 *
 * @param codePoint  the unicode code point to encode.
 * @return the decimal encoded form of the specified unicode code point.
 * @see #getCharacterReferenceString(int codePoint)
 * @see #getHexadecimalCharacterReferenceString(int codePoint)
 */
public static String getDecimalCharacterReferenceString(final int codePoint) {
  try {
    final Appendable encoded=appendDecimalCharacterReferenceString(new StringBuilder(),codePoint);
    return encoded.toString();
  } catch (IOException ex) {
    throw new RuntimeException(ex); // never happens
  }
}

/**
 * Appends the character represented by this character reference to the specified appendable object.
 * <p>
 * If this character is a unicode <a target="_blank" href="http://unicode.org/glossary/#supplementary_character">supplementary character</a>,
 * then both the UTF-16 high/low surrogate <code>char</code> values of the character are appended, as described in the
 * <a target="_blank" href="http://java.sun.com/javase/6/docs/api/java/lang/Character.html#unicode">Unicode character representations</a> section of the
 * <code>java.lang.Character</code> class.
 * <p>
 * If the static {@link Config#ConvertNonBreakingSpaces} property is set to <code>true</code> (the default),
 * then calling this method on a non-breaking space character reference ({@link CharacterEntityReference#_nbsp &amp;nbsp;})
 * results in a normal space being appended.
 *
 * @param appendable  the object to append this character reference to.
 * @throws IOException if the appendable throws it.
 */
public final void appendCharTo(Appendable appendable) throws IOException {
  final boolean convertNonBreakingSpaces=Config.ConvertNonBreakingSpaces;
  appendCharTo(appendable,convertNonBreakingSpaces);
}

/**
 * Decodes the specified encoded text, with explicit control over the conversion of non-breaking spaces.
 * <p>
 * Text that contains no <code>'&amp;'</code> character cannot contain a character reference and is returned
 * as a string without further processing. Otherwise decoding starts searching at the first ampersand.
 *
 * @param encodedText  the text to decode, may be <code>null</code>.
 * @param insideAttributeValue  specifies whether the encoded text is inside an attribute value.
 * @param convertNonBreakingSpaces  whether non-breaking spaces are decoded to normal spaces.
 * @return the decoded string, or <code>null</code> if <code>encodedText</code> is <code>null</code>.
 */
static final String decode(final CharSequence encodedText, final boolean insideAttributeValue, final boolean convertNonBreakingSpaces) {
  if (encodedText==null) {
    return null;
  }
  // converting to string first is faster than searching the CharSequence directly.
  final String textAsString=encodedText.toString();
  final int searchBegin=textAsString.indexOf('&');
  if (searchBegin==-1) {
    return textAsString;
  }
  final StringBuilder decoded=new StringBuilder(encodedText.length());
  try {
    // The original CharSequence (not the string copy) is passed on so that a Segment argument is still recognised as such.
    return appendDecode(decoded,encodedText,searchBegin,insideAttributeValue,convertNonBreakingSpaces).toString();
  } catch (IOException ex) {
    throw new RuntimeException(ex); // never happens
  }
}

/**
 * Handles GET requests for the e-mail address editing page.
 * <p>
 * The model is built from the current user's profile, and the optional <code>newEmail</code> request
 * parameter is added to it after being passed through <code>CharacterReference.encode</code>.
 *
 * @param email  the value of the optional <code>newEmail</code> request parameter.
 * @return the model and view for the e-mail address editing page.
 */
@RequestMapping(value = { "/user/edit/emailAddresses" }, method = RequestMethod.GET)
public ModelAndView handleEmailAddresses(@RequestParam(value = "newEmail", required=false) String email) {
  final UserProfile currentProfile = this.userBusinessService.getCurrentUserProfile();
  final Map<String, Object> viewModel = getSynchronizedEmailAddressesModel(currentProfile);
  final String encodedEmail = CharacterReference.encode(email);
  viewModel.put("newEmail", encodedEmail);

  return new ModelAndView(USER_EMAIL_ADRESSES_EDIT, viewModel);
}

/**
 * Encodes the specified character into a character reference if {@linkplain Config#CurrentCharacterReferenceEncodingBehaviour required}.
 * <p>
 * The character is treated in the same way as each character passed to the
 * {@link #encode(CharSequence unencodedText, boolean insideAttributeValue)} method
 * with <code>insideAttributeValue</code> set to <code>true</code>.
 *
 * @param ch  the character to encode.
 * @return a character reference if appropriate, otherwise a string containing the original character.
 */
public static String encode(final char ch) {
  final StringBuilder encoded=new StringBuilder(MAX_ENTITY_REFERENCE_LENGTH);
  try {
    return appendEncode(encoded,ch,true).toString();
  } catch (IOException ex) {
    throw new RuntimeException(ex); // never happens
  }
}

/**
 * Appends the specified character to the appendable object, encoded as a character reference when the
 * current character reference encoding behaviour requires it.
 * <p>
 * A character that needs encoding is written as a character entity reference when a name is known for it,
 * otherwise as a decimal character reference. The apostrophe is always written in decimal form, even if
 * a name is returned for it.
 *
 * @param appendable  the destination of the possibly encoded character.
 * @param ch  the character to append.
 * @param insideAttributeValue  passed to the encoding behaviour to decide whether the character needs encoding.
 * @return the specified appendable object.
 * @throws IOException if the appendable throws it.
 */
private static final Appendable appendEncode(final Appendable appendable, final char ch, final boolean insideAttributeValue) throws IOException {
  if (!Config.CurrentCharacterReferenceEncodingBehaviour.isEncoded(ch,insideAttributeValue)) {
    appendable.append(ch);
    return appendable;
  }
  final String entityName=CharacterEntityReference.getName(ch);
  final boolean useEntityName=entityName!=null && ch!='\'';
  if (useEntityName) {
    CharacterEntityReference.appendCharacterReferenceString(appendable,entityName);
  } else {
    appendDecimalCharacterReferenceString(appendable,ch);
  }
  return appendable;
}

/**
 * Converts HTML encoded text into plain text.
 * <p>
 * Every {@linkplain CharacterEntityReference character entity reference} and
 * {@linkplain NumericCharacterReference numeric character reference} is replaced by the character it represents.
 * <p>
 * The treatment of <a href="#Unterminated">unterminated</a> character references depends on the
 * <code>insideAttributeValue</code> argument together with the
 * {@linkplain Config#CurrentCompatibilityMode current compatibility mode}.
 * <p>
 * When the static {@link Config#ConvertNonBreakingSpaces} property is <code>true</code> (the default),
 * non-breaking space ({@link CharacterEntityReference#_nbsp &amp;nbsp;}) character entity references are decoded to normal spaces.
 * <p>
 * Character entity reference names are case sensitive, and some names differ from others only by case,
 * yet some browsers also accept them case-insensitively.
 * The decoding methods in this library therefore recognise character entity reference names even when their case is wrong.
 *
 * @param encodedText  the text to decode.
 * @param insideAttributeValue  specifies whether the encoded text is inside an attribute value.
 * @return the decoded string.
 * @see #decode(CharSequence)
 * @see #encode(CharSequence)
 */
public static String decode(final CharSequence encodedText, final boolean insideAttributeValue) {
  final boolean convertNonBreakingSpaces=Config.ConvertNonBreakingSpaces;
  return decode(encodedText,insideAttributeValue,convertNonBreakingSpaces);
}

/**
 * Returns the source text in the range <code>[start, end)</code> with each character reference
 * that lies completely inside that range replaced by the character it represents.
 * <p>
 * A character reference that begins inside the range but finishes after <code>end</code> is not decoded;
 * the part of it that lies inside the range is copied verbatim. Without this rule the copy position would
 * move past <code>end</code> and the final <code>subSequence</code> call would be given an inverted range.
 * <p>
 * References are appended by code point rather than via <code>getChar()</code>, so that supplementary
 * characters are emitted as a full surrogate pair instead of being truncated to a single <code>char</code>.
 *
 * @param start  the position in the source at which to begin (inclusive).
 * @param end  the position in the source at which to stop (exclusive).
 * @return the decoded text of the specified range.
 */
private String parseText(int start, int end) {
  final StringBuilder sb = new StringBuilder();
  int pos = start;
  while (pos < end) {
    final CharacterReference ref = source.getNextCharacterReference(pos);
    // Stop at the first reference that is missing, starts outside the range, or straddles its end.
    if (ref == null || ref.getBegin() >= end || ref.getEnd() > end) {
      break;
    }
    sb.append(source.subSequence(pos, ref.getBegin()));
    sb.appendCodePoint(ref.getCodePoint());
    pos = ref.getEnd();
  }
  // Copy whatever remains after the last decoded reference unchanged.
  sb.append(source.subSequence(pos, end));
  return sb.toString();
}

Javadoc

Represents an HTML Character Reference, implemented by the subclasses CharacterEntityReference and NumericCharacterReference.

This class, together with its subclasses, contains static methods to perform most required operations without having to instantiate an object.

Instances of this class are useful when the positions of character references in a source document are required, or to replace the found character references with customised text.

CharacterReference instances are obtained using one of the following methods:

CharacterReference#parse(CharSequence characterReferenceText)
Source#getNextCharacterReference(int pos)
Source#getPreviousCharacterReference(int pos)
Segment#getAllCharacterReferences()

Most used methods

decode
encode
Encodes the specified text, escaping certain characters into character references. The Config#Curren
getChar
Returns the character represented by this character reference. If this character reference represent
getEnd
appendCharTo
appendCollapseWhiteSpace
appendDecimalCharacterReferenceString
appendDecode
appendEncode
appendHexadecimalCharacterReferenceString
appendUnicodeText
construct

Popular in Java

Reading from database using SQL prepared statement
getOriginalFilename (MultipartFile)
Return the original filename in the client's filesystem. This may contain path information depending
notifyDataSetChanged (ArrayAdapter)
setScale (BigDecimal)
EOFException (java.io)
Thrown when a program encounters the end of a file or stream during an input operation.
InputStream (java.io)
A readable source of bytes. Most clients will use input streams that read data from the file system (
PriorityQueue (java.util)
A PriorityQueue holds elements on a priority heap, which orders the elements according to their natu
Window (java.awt)
A Window object is a top-level window with no borders and no menubar. The default layout for a windo
JFileChooser (javax.swing)
Location (org.springframework.beans.factory.parsing)
Class that models an arbitrary location in a Resource. Typically used to track the location of proble
Top Sublime Text plugins

How to use CharacterReference in net.htmlparser.jericho

Best Java code snippets using net.htmlparser.jericho.CharacterReference (Showing top 20 results out of 315)

How to use
CharacterReference
in
net.htmlparser.jericho