/**
 * Checks that an escape written with a plus sign after the 'u' marker
 * (backslash, 'u', '+', then the hex digits 0047) is translated to "G".
 */
@Test
public void testUPlus() {
    final String escaped = "\\u+0047";
    final UnicodeUnescaper unescaper = new UnicodeUnescaper();

    final String actual = unescaper.translate(escaped);

    assertEquals("Failed to unescape Unicode characters with 'u+' notation", "G", actual);
}
/** * By default html tags and special characters are stripped from the matches * * @param group * @return */ private String cleanHtml(String group) { if (group == null) return ""; LOGGER.trace("Before Clean Html: " + group); // String s = group.replaceAll("<[^>]+>", ""); String s = Jsoup.parse(uu.translate(group)).body().text(); LOGGER.trace("After Clean Html: " + s); return s; }
protected Document parseXmlString(String xmlString) throws Exception { DocumentBuilder parser = factory.newDocumentBuilder(); String xml = xmlString; // xml = Utils.replaceAcutesHTML(xml); xml = uu.translate(xml); xml = StringEscapeUtils.unescapeHtml4(xml); xml = StringEscapeUtils.unescapeXml(xml); xml = StringEscapeUtils.unescapeXml(xml); xml = StringEscapeUtils.unescapeXml(xml); xml = xml.replaceAll("\\&", "\\&"); // xml = StringEscapeUtils.escapeXml(xml); Document doc = null; for (String charset : new String[] { "UTF-8", "ISO-8859-1", "US-ASCII" }) { try { doc = parser.parse(new ByteArrayInputStream(xml.getBytes(charset))); break; } catch (Throwable t) { LOGGER.error("Failed to parse xml using charset: " + charset + " - " + t.getMessage()); } } if (doc == null) { LOGGER.error("Unabled to parse xml string"); LOGGER.error(xml); throw new Exception("Unable to parse xml!"); } return doc; }
/**
 * Checks that an escape whose backslash is followed by a run of several 'u'
 * characters before the hex digits 0047 is still translated to "G".
 */
@Test
public void testUuuuu() {
    final String manyMarkers = "\\uuuuuuuu0047";
    final UnicodeUnescaper unescaper = new UnicodeUnescaper();

    assertEquals(
            "Failed to unescape Unicode characters with many 'u' characters",
            "G",
            unescaper.translate(manyMarkers));
}
@Test public void testLessThanFour() { final UnicodeUnescaper uu = new UnicodeUnescaper(); final String input = "\\0047\\u006"; try { uu.translate(input); fail("A lack of digits in a Unicode escape sequence failed to throw an exception"); } catch(final IllegalArgumentException iae) { // expected } } }
private String escapeValue(String value) { // Use Apache Commons to escape the value String result = StringEscapeUtils.escapeJava(value); //The above also encodes unicode characters to \u0048\u0065\u006C\u006C etc sequences. To remove that, we do return new UnicodeUnescaper().translate(result); }