/**
 * Fetches {@code page} and extracts the description text from it.
 *
 * <p>The description is taken from the first element matching
 * {@code td[class=alt1][width="70%"]}. Each {@code <br>} and {@code <p>} is
 * marked with a literal backslash-n placeholder, the placeholders are turned
 * into the platform line separator, and all remaining markup is stripped.
 * The content of the page's {@code og:title} meta tag is put on the first
 * line of the result.
 *
 * <p>Cookies returned with the response are merged into {@code cookies}.
 *
 * @param page URL of the page to fetch
 * @return the title line followed by the plain-text description, or
 *         {@code null} if the page could not be fetched or holds no description
 */
public String getDescription(String page) {
    try {
        // Fetch the image page
        Response resp = Http.url(page)
                .referrer(this.url)
                .response();
        cookies.putAll(resp.cookies());

        // Parse the response body exactly once and reuse the resulting document.
        // Re-parsing is wasted work, and (assuming this is jsoup's
        // Connection.Response) some jsoup versions reject a second parse() once
        // the body stream has been consumed.
        Document documentz = resp.parse();

        // Try to find the description
        Elements els = documentz.select("td[class=alt1][width=\"70%\"]");
        if (els.isEmpty()) {
            LOGGER.debug("No description at " + page);
            throw new IOException("No description found");
        }
        LOGGER.debug("Description found!");

        // This is where the description is.
        // Would break completely if FurAffinity changed site layout.
        Element ele = els.get(0);
        documentz.outputSettings(new Document.OutputSettings().prettyPrint(false));

        // Mark line breaks with a literal backslash-n so they survive the clean step.
        ele.select("br").append("\\n");
        ele.select("p").prepend("\\n\\n");
        LOGGER.debug("Returning description at " + page);
        String tempPage = Jsoup.clean(
                ele.html().replaceAll("\\\\n", System.getProperty("line.separator")),
                "",
                Whitelist.none(),
                new Document.OutputSettings().prettyPrint(false));

        // Overridden saveText takes first line and makes it the file name.
        return documentz.select("meta[property=og:title]").attr("content") + "\n" + tempPage;
    } catch (IOException ioe) {
        LOGGER.info("Failed to get description " + page + " : '" + ioe.getMessage() + "'");
        return null;
    }
}
@Override
final EscapeMode escapeMode = out.escapeMode(); final CharsetEncoder encoder = out.encoder(); final CoreCharset coreCharset = out.coreCharset; // init in out.prepareEncoder() final int length = string.length();
OutputSettings.Syntax syntax = outputSettings().syntax(); Element metaCharset = select("meta[charset]").first(); metaCharset.attr("charset", charset().displayName()); } else { Element head = head();
private static String reformatXHtml(final String inputXhtml, final Map<String, ConfluenceLink> confluenceLinkMap) { final Document document = Jsoup.parse(inputXhtml, "utf-8", Parser.xmlParser()); document.outputSettings().prettyPrint(false); document.outputSettings().escapeMode(xhtml); document.outputSettings().charset("UTF-8"); final Elements linkElements = document.select("a");
/**
 * Parses {@code content} with jsoup's XML parser (empty base URI) and turns
 * pretty-printing off on the resulting document.
 *
 * @param content the markup to parse
 * @return the parsed document with pretty-printing disabled
 */
@Override
public Document parse(String content) {
    final Document parsed = Jsoup.parse(content, "", Parser.xmlParser());
    // Disable re-indentation so serialised output keeps the original whitespace.
    parsed.outputSettings().prettyPrint(false);
    return parsed;
}
Document doc = Jsoup.parse(html); if (truncateStuff) { Elements dataSrcs = doc.select("img[src^=data:]"); for (Element dataSrc : dataSrcs) { if (dataSrc.attr("src").length() <= MAX_DATASRC_ATTR_LENGTH) { + OVERLENGTH_INDICATOR); Elements headStyles = doc.select("head style"); for (Element content : headStyles) { if (content.data().length() <= MAX_DATASRC_ATTR_LENGTH) { doc.outputSettings().prettyPrint(true); doc.outputSettings().indentAmount(4); return doc.toString();
/** * Fix unclosed tags by loading into and out of JSoup * * @param badXml * @return * @throws IOException * @throws PatentReaderException */ public static Document fixTagsJDOM(String badXml) throws IOException, PatentReaderException { org.jsoup.nodes.Document jsoupDoc = Jsoup.parse("<body>" + badXml + "</body>", "", Parser.xmlParser().settings(ParseSettings.preserveCase)); jsoupDoc.outputSettings().prettyPrint(false); String doc = jsoupDoc.select("body").html(); // Add HTML DTD to ensure HTML entities do not cause any problems. doc = "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD XHTML 1.0 Strict//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd\">\n" + doc; try { SAXReader sax = new SAXReader(false); sax.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false); return sax.read(new StringReader(doc)); } catch (DocumentException | SAXException e) { throw new PatentReaderException("Failed to Fix and Parse Docuemnt", e); } } }
/**
 * Parses an HTML stream with jsoup, re-serialises it using XML syntax and
 * hands the resulting XHTML bytes to {@code convertXHTMLToPDFInputStream}.
 *
 * @param paramSource the HTML input, decoded with the configured charset
 * @return whatever {@code convertXHTMLToPDFInputStream} returns for the XHTML
 * @throws ConversionException if the HTML input cannot be read
 */
private InputStream convertHTMLToPDFInputStream(InputStream paramSource) throws ConversionException {
    Document document;
    try {
        document = Jsoup.parse(paramSource, getConfiguredCharset(), "");
    } catch (IOException e) {
        throw new ConversionException(
                "Conversion is not possible due to an IOException. Error message is " + e.getMessage(), e);
    }
    document.outputSettings().syntax(Document.OutputSettings.Syntax.xml);
    String xhtmlString = document.html();

    // Encode explicitly instead of relying on the platform default charset, which
    // varies between machines and may silently replace characters it cannot
    // represent. UTF-8 can encode every character of the string.
    // NOTE(review): confirm convertXHTMLToPDFInputStream decodes the stream as UTF-8.
    ByteArrayInputStream bais =
            new ByteArrayInputStream(xhtmlString.getBytes(java.nio.charset.StandardCharsets.UTF_8));
    return convertXHTMLToPDFInputStream(bais);
}
Document jsoupDoc = Jsoup.parse("<body>" + rawText + "</body>", "", Parser.xmlParser()); jsoupDoc.outputSettings().prettyPrint(false).charset(StandardCharsets.UTF_16); Elements figEls = jsoupDoc.select("a.figref"); for (int i = 1; i <= figEls.size(); i++) { Element element = figEls.get(i - 1); Elements headerEls = jsoupDoc.select("PAC"); for (int i = 1; i <= headerEls.size(); i++) { Element element = headerEls.get(i - 1); whitelist.addAttributes(":all", HTML_WHITELIST_ATTRIB); OutputSettings outSettings = new Document.OutputSettings(); outSettings.charset(Charsets.UTF_16); outSettings.prettyPrint(false);
/**
 * Returns the inner HTML of the {@code <body>} jsoup builds from {@code text},
 * serialised without pretty-printing.
 *
 * @param text HTML markup; blank input is returned unchanged
 * @return the body's inner HTML, or {@code text} itself when it is blank
 */
public String getHtmlBody(String text) {
    if (!StringUtils.isBlank(text)) {
        final org.jsoup.nodes.Document parsed = org.jsoup.Jsoup.parse(text);
        // Keep the markup's own whitespace instead of jsoup's re-indentation.
        parsed.outputSettings().prettyPrint(false);
        return parsed.body().html();
    }
    return text;
}
Document headingText = Jsoup.parseBodyFragment(getHeadingText(reader, headingLevel), ""); headingText.outputSettings().prettyPrint(false); Elements headingLinks = headingText.select("a"); for (Element headingLink : headingLinks) lastLevel = insertInBuilderStructure(headingLevel, headingText.body().text(), headingId, outlineBuilder, lastLevel);
Jsoup.parse("<!DOCTYPE html>\n<html lang=\"" + da.getLanguage() + "\"></html>"); doc.outputSettings(new Document.OutputSettings().prettyPrint(false)); final Element head = doc.head(); doc.title(title);
/**
 * Pre-process HTML.
 *
 * <p>When the request carries {@code pre-parse-html=true}, the stream is parsed
 * with jsoup as ISO-8859-9, re-serialised with the XHTML escape mode, and
 * returned as a new in-memory stream. The original stream is closed in that
 * case. Otherwise the original stream is returned untouched.
 *
 * @param request the request
 * @param is the input stream
 * @return the updated input stream
 * @throws IOException Signals that an I/O exception has occurred.
 */
protected InputStream preProcessHtml(Request request, InputStream is) throws IOException {
    if (request == null || is == null || !Boolean.parseBoolean(request.getParameter("pre-parse-html"))) {
        return is;
    }

    try {
        org.jsoup.nodes.Document doc = Jsoup.parse(is, "ISO-8859-9", "/");
        doc.outputSettings().escapeMode(EscapeMode.xhtml);

        // Encode with the charset the document is serialised for rather than the
        // platform default, so the bytes do not depend on the host's settings.
        // NOTE(review): confirm the consumer of this stream decodes it with the
        // same charset.
        return new ByteArrayInputStream(doc.outerHtml().getBytes(doc.outputSettings().charset()));
    } finally {
        IOUtils.closeQuietly(is);
    }
}
/**
 * Parses a body fragment and returns the {@code <body>} element jsoup wraps
 * it in. The owning document's output charset is set to {@code outputEncoding}.
 *
 * @param content the HTML fragment to parse
 * @return the {@code body} element of the parsed content
 */
private Element parseContent(String content) {
    final Document fragmentDoc = Jsoup.parseBodyFragment(content);
    final Document.OutputSettings settings = fragmentDoc.outputSettings();
    settings.charset(outputEncoding);
    return fragmentDoc.body();
}
Document doc = Jsoup.parse(htmlText); doc.outputSettings().charset("UTF-8"); doc.outputSettings().escapeMode(EscapeMode.xhtml); htmlText = Jsoup.clean(doc.body().html(), wl); htmlText = StringEscapeUtils.unescapeHtml(htmlText); if (optAsBoolean("alphanumeric", false, c, parameters, dataStreams)) {
@Override public String stripHtmlFromText(String text, boolean smartSpacing, boolean stripEscapeSequences) { if (StringUtils.isBlank(text)) return text; if (smartSpacing) { text = text.replaceAll("/br>", "/br> ").replaceAll("/p>", "/p> ").replaceAll("/tr>", "/tr> "); } if (stripEscapeSequences) { org.jsoup.nodes.Document document = org.jsoup.Jsoup.parse(text); org.jsoup.nodes.Element body = document.body(); //remove any html tags, unescape any escape characters text = body.text(); // are converted to char code 160, java doesn't treat it like whitespace, so replace it with ' ' text = text.replace((char)160, ' '); } else { text = org.jsoup.Jsoup.clean(text, "", org.jsoup.safety.Whitelist.none(), new org.jsoup.nodes.Document.OutputSettings().prettyPrint(false).outline(false)); } if (smartSpacing || stripEscapeSequences) { text = text.replaceAll("\\s+", " "); } return text.trim(); }
/**
 * Builds a {@code RichText} tree from an HTML string.
 *
 * <p>The HTML is parsed with jsoup (pretty-printing disabled), and the first
 * {@code body} element is handed to {@code parse} together with a fresh,
 * empty root node.
 *
 * @param html the HTML to convert
 * @return the root node populated by {@code parse}
 */
public static RichText fromHtml(String html) {
    final Document parsed = Jsoup.parse(html);
    parsed.outputSettings().prettyPrint(false);

    final RichText result = new RichText("");
    parse(result, parsed.getElementsByTag("body").get(0));
    return result;
}
@Override public void generatePDF(OutputStream outputStream, Path template, Path renderingRoot, Map<String, Object> context) throws PdfRenderingException { ITextRenderer renderer = new ITextRenderer(); try { String html = templateRenderer.renderAsString(template, context); // Ensure we have a valid XHTML document using JSoup Document jsoupDoc = Jsoup.parse(html); jsoupDoc.outputSettings().syntax(Document.OutputSettings.Syntax.xml); jsoupDoc.outputSettings().escapeMode(Entities.EscapeMode.xhtml); jsoupDoc.outputSettings().charset("UTF-8"); String path = renderingRoot.toAbsolutePath().toUri().toString(); renderer.setDocumentFromString(jsoupDoc.toString(), path); renderer.layout(); renderer.createPDF(outputStream); } catch (DocumentException | TemplateRenderingException e) { throw new PdfRenderingException(e); } } }
/** * Takes an input string representing an html document and processes it with * the Css Inliner. * * @param input the html document * @return the processed html document */ public String process(String input) { Document doc = Jsoup.parse(input); // check if the user wants to inline the data Elements elements = doc.getElementsByAttributeValue(DATA_INLINE_ATTR, "true"); if (elements.isEmpty()) { return input; } extractStyles(doc); applyStyles(doc); inlineImages(doc); doc.outputSettings(doc.outputSettings().syntax(Document.OutputSettings.Syntax.xml).prettyPrint(false).escapeMode(Entities.EscapeMode.extended)); return StringEscapeUtils.unescapeHtml(doc.outerHtml()); }
@Override public Document process(final Metadata metadata, final Document document) { // If we have 1 node, it a p, it contains only text nodes, then treat it as pre if (document.body().children().size() == 1) { final Elements paragraphs = document.select("body > p"); if (paragraphs.size() == 1 && isAllTextNodes(paragraphs.first())) { paragraphs.first().tagName("pre"); document.body().textNodes().forEach(TextNode::remove); document.outputSettings().prettyPrint(false); } } return document; }