/**
 * Builds the unencoded XML declaration as a string.
 * <p>
 * The declaration is written into a buffer using default output settings and
 * the result is returned with leading and trailing whitespace trimmed.
 *
 * @return XML declaration
 * @throws SerializationException if writing the declaration raises an {@link IOException}
 */
public String getWholeDeclaration() {
    final StringBuilder accum = new StringBuilder();
    try {
        getWholeDeclaration(accum, new Document.OutputSettings());
    } catch (IOException e) {
        // Surface the checked I/O failure as the unchecked serialization error.
        throw new SerializationException(e);
    }
    final String declaration = accum.toString();
    return declaration.trim();
}
/**
 * Fetches {@code page} and extracts its description as plain text.
 * <p>
 * The response cookies are merged into {@code cookies}. The description is read
 * from the first {@code td[class=alt1][width="70%"]} cell; {@code <br>} and
 * {@code <p>} elements are turned into line breaks before all markup is stripped.
 *
 * @param page URL of the page to fetch
 * @return the {@code og:title} content, a newline, and the description text,
 *         or {@code null} if the page could not be fetched or has no description
 */
public String getDescription(String page) {
    try {
        // Fetch the image page
        Response resp = Http.url(page)
                .referrer(this.url)
                .response();
        cookies.putAll(resp.cookies());

        // Parse the body exactly once and reuse the document for every lookup.
        // Calling parse() a second time re-parses the whole body and is only
        // supported by jsoup when the response has been buffered first.
        Document documentz = resp.parse();

        // Try to find the description
        Elements els = documentz.select("td[class=alt1][width=\"70%\"]");
        if (els.isEmpty()) {
            LOGGER.debug("No description at " + page);
            throw new IOException("No description found");
        }
        LOGGER.debug("Description found!");

        // This is where the description is.
        // Would break completely if FurAffinity changed site layout.
        Element ele = els.get(0);

        // Pretty-printing must be off before html() is called so the original
        // spacing survives.
        documentz.outputSettings(new Document.OutputSettings().prettyPrint(false));

        // Mark line breaks with a literal backslash-n placeholder; it is swapped
        // for the platform line separator just before the markup is stripped.
        ele.select("br").append("\\n");
        ele.select("p").prepend("\\n\\n");
        LOGGER.debug("Returning description at " + page);
        String tempPage = Jsoup.clean(
                ele.html().replaceAll("\\\\n", System.getProperty("line.separator")),
                "",
                Whitelist.none(),
                new Document.OutputSettings().prettyPrint(false));

        // Overridden saveText takes first line and makes it the file name.
        return documentz.select("meta[property=og:title]").attr("content") + "\n" + tempPage;
    } catch (IOException ioe) {
        LOGGER.info("Failed to get description " + page + " : '" + ioe.getMessage() + "'");
        return null;
    }
}

@Override
// Sample markup to sanitize.
String html = "<p>Arbit string <b>of</b><br><br>text. <em>What</em> to <strong>do</strong> with it?";

// Allow the simple-text whitelist extended with <br>, and keep the output on
// one line by switching pretty-printing off.
Whitelist allowed = Whitelist.simpleText().addTags("br");
Document.OutputSettings compact = new Document.OutputSettings().prettyPrint(false);

String cleaned = Jsoup.clean(html, "", allowed, compact);
System.out.println(cleaned);
// Permit the shared attribute list on every tag.
whitelist.addAttributes(":all", HTML_WHITELIST_ATTRIB);

// Output settings: UTF-16 charset, XML syntax. The setters return the settings
// object itself, so they are chained here.
OutputSettings outSettings = new Document.OutputSettings()
        .charset(Charsets.UTF_16)
        .syntax(Syntax.xml);
// Permit the shared attribute list on every tag.
whitelist.addAttributes(":all", HTML_WHITELIST_ATTRIB);

// Output settings: UTF-16 charset, XML syntax. The setters return the settings
// object itself, so they are chained here.
OutputSettings outSettings = new Document.OutputSettings()
        .charset(Charsets.UTF_16)
        .syntax(Syntax.xml);
// Permit the shared attribute list on every tag.
whitelist.addAttributes(":all", HTML_WHITELIST_ATTRIB);

// Output settings: UTF-16 charset, XML syntax. The setters return the settings
// object itself, so they are chained here.
OutputSettings outSettings = new Document.OutputSettings()
        .charset(Charsets.UTF_16)
        .syntax(Syntax.xml);
// Permit the shared attribute list on every tag.
whitelist.addAttributes(":all", HTML_WHITELIST_ATTRIB);

// Output settings: UTF-16 charset with pretty-printing switched off. The
// setters return the settings object itself, so they are chained here.
OutputSettings outSettings = new Document.OutputSettings()
        .charset(Charsets.UTF_16)
        .prettyPrint(false);
// Replace the document's output settings with a fresh instance that has
// pretty-printing turned off, then take a reference to the <head> element.
doc.outputSettings(new Document.OutputSettings().prettyPrint(false)); final Element head = doc.head();
public class HtmlWithLineBreaks { public String getCleanHtml(Document document) { document.outputSettings(new Document.OutputSettings().prettyPrint(false)); //makes html() call preserve linebreaks and spacing return Jsoup.clean(document.html(), "", Whitelist.none(), new Document.OutputSettings().prettyPrint(false)); } public static void main(String... args) { File input = new File("/path/to/some/input.html"); //Just replace the input with you own html file source Document document; try { document = Jsoup.parse(input, "UTF-8"); String printOut = new HtmlWithLineBreaks().getCleanHtml(document); System.out.println(printOut); } catch (IOException e) { e.printStackTrace(); } } }
@Override public String stripHtmlFromText(String text, boolean smartSpacing, boolean stripEscapeSequences) { if (StringUtils.isBlank(text)) return text; if (smartSpacing) { text = text.replaceAll("/br>", "/br> ").replaceAll("/p>", "/p> ").replaceAll("/tr>", "/tr> "); } if (stripEscapeSequences) { org.jsoup.nodes.Document document = org.jsoup.Jsoup.parse(text); org.jsoup.nodes.Element body = document.body(); //remove any html tags, unescape any escape characters text = body.text(); // are converted to char code 160, java doesn't treat it like whitespace, so replace it with ' ' text = text.replace((char)160, ' '); } else { text = org.jsoup.Jsoup.clean(text, "", org.jsoup.safety.Whitelist.none(), new org.jsoup.nodes.Document.OutputSettings().prettyPrint(false).outline(false)); } if (smartSpacing || stripEscapeSequences) { text = text.replaceAll("\\s+", " "); } return text.trim(); }
// Install fresh output settings on d: xhtml escape mode, pretty-printing off.
d.outputSettings(new Document.OutputSettings().escapeMode(EscapeMode.xhtml).prettyPrint(false));
// Clean the sample against the basic whitelist extended with br, p and i,
// with pretty-printing explicitly enabled.
Whitelist allowedTags = Whitelist.basic().addTags("br", "p", "i");
Document.OutputSettings prettySettings = new Document.OutputSettings().prettyPrint(true);
String pretty = Jsoup.clean(
        "<img src=\"marco\">Capretta</img><i>Sono misterioso</i><p color=\"white\"><font size=\"5\">Ciao</p><p>some text</p><br/> <p>another text</p></font>",
        "",
        allowedTags,
        prettySettings);

// Re-parse the cleaned markup and render the children of the first <body> element.
pretty = Jsoup.parse(pretty)
        .getElementsByTag("body")
        .get(0)
        .children()
        .toString();
System.out.println(pretty);
// Output settings with pretty-printing disabled.
Document.OutputSettings outputSettings = new Document.OutputSettings();
outputSettings.prettyPrint(false);

// Sanitize body against the relaxed whitelist using those settings.
body = Jsoup.clean(body, "", Whitelist.relaxed(), outputSettings);
Document doc = Jsoup.parse(sample); doc.outputSettings(new Document.OutputSettings().prettyPrint(false)); String output = doc.body().html();
/**
 * Parses an HTML body fragment with Jsoup and decorates the resulting document.
 * <p>
 * Pretty-printing is disabled on the document, {@code DOC_STYLE} is appended to
 * its head and {@code HIGHLIGHT_JS_SCRIPT} is appended to its body.
 *
 * @param htmlText a text to parse.
 * @return a document with parsed text.
 */
private Document getParsedHtmlDocument(String htmlText) {
    final Document parsed = Jsoup.parseBodyFragment(htmlText);
    parsed.outputSettings(new Document.OutputSettings().prettyPrint(false));

    parsed.head().append(DOC_STYLE);
    parsed.body().append(HIGHLIGHT_JS_SCRIPT);

    return parsed;
}
/**
 * Demo: converts {@code <br>} and {@code <p>} elements into line breaks and
 * prints the sample with all markup removed.
 */
public class Test {

    public static void main(String[] args) {
        String markup = "<p>Text<br /> New Text<br />Second Text<br />Third Text</p>";

        Document parsed = Jsoup.parse(markup);
        // Pretty-printing off so html() keeps the spacing as written.
        parsed.outputSettings(new Document.OutputSettings().prettyPrint(false));

        // Insert literal backslash-n placeholders where line breaks belong.
        parsed.select("br").append("\\n");
        parsed.select("p").prepend("\\n\\n");

        // Swap the placeholders for real newlines, then strip every tag.
        String withBreaks = parsed.html().replaceAll("\\\\n", "\n");
        String plain = Jsoup.clean(
                withBreaks,
                "",
                Whitelist.none(),
                new Document.OutputSettings().prettyPrint(false));
        System.out.println(plain);
    }
}
public String getCleanHtml(Document document) { document.outputSettings(new Document.OutputSettings().prettyPrint(false)); document.select("h1").parents().select("div").append("\n"); // Insert a linebreak after the h1 div parent. return Jsoup.clean(document.html(), "", Whitelist.none(), new Document.OutputSettings().prettyPrint(false)); }
// breaks multi-level of escaping, preventing &lt;script&gt; to be rendered as <script> String replace = input.replace("&", ""); // decode any encoded html, preventing <script> to be rendered as <script> String html = StringEscapeUtils.unescapeHtml(replace); // remove all html tags, but maintain line breaks String clean = Jsoup.clean(html, "", Whitelist.none(), new Document.OutputSettings().prettyPrint(false)); // decode html again to convert character entities back into text return StringEscapeUtils.unescapeHtml(clean);
/**
 * Cleans {@code input} against an empty whitelist, with pretty-printing
 * disabled on the output.
 *
 * @param input the HTML to clean
 * @return the cleaned result
 */
public static String cleanNoMarkup(String input) {
    return Jsoup.clean(
            input,
            "",
            Whitelist.none(),
            new Document.OutputSettings().prettyPrint(false));
}
public static String br2nl(String html) { if(html==null) return html; Document document = Jsoup.parse(html); document.outputSettings(new Document.OutputSettings().prettyPrint(false));//makes html() preserve linebreaks and spacing document.select("br").append("\\n"); document.select("p").prepend("\\n\\n"); String s = document.html().replaceAll("\\\\n", "\n"); return Jsoup.clean(s, "", Whitelist.none(), new Document.OutputSettings().prettyPrint(false)); }