/**
 * Sets the document's output charset, identified by name.
 *
 * @param charset name of the charset to switch to; looked up with {@link Charset#forName(String)}
 * @return these output settings, for chaining
 */
public OutputSettings charset(String charset) {
    final Charset resolved = Charset.forName(charset);
    charset(resolved);
    return this;
}
@Override public OutputSettings clone() { OutputSettings clone; try { clone = (OutputSettings) super.clone(); } catch (CloneNotSupportedException e) { throw new RuntimeException(e); } clone.charset(charset.name()); // new charset and charset encoder clone.escapeMode = Entities.EscapeMode.valueOf(escapeMode.name()); // indentAmount, prettyPrint are primitives so object.clone() will handle return clone; } }
/**
 * Creates output settings whose charset defaults to UTF-8.
 */
public OutputSettings() {
    // Use the JDK's guaranteed UTF-8 constant instead of a runtime lookup of "UTF8",
    // which is only an alias and not a charset name the Java specification promises.
    charset(java.nio.charset.StandardCharsets.UTF_8);
}
.syntax(Document.OutputSettings.Syntax.xml) .escapeMode(Entities.EscapeMode.xhtml) .charset(charset);
// Select UTF-8 output and XHTML escaping in one fluent chain, then pass the body HTML and wl to Jsoup.clean.
doc.outputSettings()
        .charset("UTF-8")
        .escapeMode(EscapeMode.xhtml);
htmlText = Jsoup.clean(doc.body().html(), wl);
document.outputSettings().prettyPrint(false); document.outputSettings().escapeMode(xhtml); document.outputSettings().charset("UTF-8");
Document clean = cleaner.clean(dirty);
// Configure the cleaned document's output one setting per statement:
// base escape mode, UTF-8 charset, pretty-printing off.
clean.outputSettings().escapeMode(Entities.EscapeMode.base);
clean.outputSettings().charset(StandardCharsets.UTF_8);
clean.outputSettings().prettyPrint(false);
Document clean = cleaner.clean(dirty);
// Configure the cleaned document's output one setting per statement:
// base escape mode, UTF-8 charset, pretty-printing off.
clean.outputSettings().escapeMode(Entities.EscapeMode.base);
clean.outputSettings().charset(StandardCharsets.UTF_8);
clean.outputSettings().prettyPrint(false);
@Override public void generatePDF(OutputStream outputStream, Path template, Path renderingRoot, Map<String, Object> context) throws PdfRenderingException { ITextRenderer renderer = new ITextRenderer(); try { String html = templateRenderer.renderAsString(template, context); // Ensure we have a valid XHTML document using JSoup Document jsoupDoc = Jsoup.parse(html); jsoupDoc.outputSettings().syntax(Document.OutputSettings.Syntax.xml); jsoupDoc.outputSettings().escapeMode(Entities.EscapeMode.xhtml); jsoupDoc.outputSettings().charset("UTF-8"); String path = renderingRoot.toAbsolutePath().toUri().toString(); renderer.setDocumentFromString(jsoupDoc.toString(), path); renderer.layout(); renderer.createPDF(outputStream); } catch (DocumentException | TemplateRenderingException e) { throw new PdfRenderingException(e); } } }
static String formatInlineCSS(final String html, final String css) throws IOException, SAXException { org.jsoup.nodes.Document parsed = Jsoup.parse(html, "UTF-8"); parsed.outputSettings().charset("UTF-8"); Document doc = DOMBuilder.jsoup2DOM(parsed); DOMAnalyzer da = new DOMAnalyzer(doc); da.attributesToStyles(); da.addStyleSheet(null, CSSNorm.stdStyleSheet(), DOMAnalyzer.Origin.AGENT); da.addStyleSheet(null, css, null); da.getStyleSheets(); da.stylesToDomInherited(); String result = toString(doc); result = result.replaceAll("class=\"topiclinkwrapper\" style=\"", "class=\"topiclinkwrapper\" style=\"text-overflow: ellipsis;"); // Remove all line breaks result = result.replaceAll("\\n", ""); return result; }
/**
 * Parses an HTML body fragment and hands back the resulting {@code <body>} element.
 * The parsed document's output charset is set to {@code outputEncoding} first.
 *
 * @param content HTML fragment to parse
 * @return the {@code body} element of the parsed content
 */
private Element parseContent(String content) {
    final Document parsed = Jsoup.parseBodyFragment(content);
    parsed.outputSettings().charset(outputEncoding);
    return parsed.body();
}
// NOTE(review): the URL comes straight from the "htmluri" request parameter and is fetched
// server-side without validation (possible SSRF) — confirm the caller restricts allowed targets.
// NOTE(review): getParameter returns null when the parameter is absent, so trim() would throw.
String url = request.getParameter("htmluri").trim();
// The original concatenated the URL onto "Fetching %s...", which printed a literal "%s".
System.out.println("Fetching " + url + "...");
Document doc = Jsoup.connect(url).get();
Document.OutputSettings settings = doc.outputSettings();
settings.prettyPrint(false);
settings.charset("ASCII");
String html = doc.html();
html = StringEscapeUtils.unescapeHtml(html);
html = Jsoup.parse(html).html(); // This will take care of any extra closing tags
System.out.println(html);
/**
 * Parses an HTML body fragment and hands back the resulting {@code <body>} element.
 * The parsed document's output charset is set to {@code outputEncoding} first.
 *
 * @param content HTML fragment to parse
 * @return the {@code body} element of the parsed content
 */
private Element parseContent(String content) {
    final Document parsed = Jsoup.parseBodyFragment(content);
    parsed.outputSettings().charset(outputEncoding);
    return parsed.body();
}
/**
 * Parses XHTML text with jsoup's XML parser and configures the document for compact,
 * XHTML-escaped, UTF-8 output.
 *
 * @param inputXhtml XHTML markup to parse
 * @return the parsed document with its output settings applied
 */
private static Document parseXhtml(final String inputXhtml) {
    // The second argument of Jsoup.parse(String, String, Parser) is the base URI, not a charset.
    // The input is already a String, so pass an empty base URI instead of "utf-8".
    final Document originalDocument = Jsoup.parse(inputXhtml, "", Parser.xmlParser());
    originalDocument.outputSettings().prettyPrint(false);
    originalDocument.outputSettings().escapeMode(xhtml);
    originalDocument.outputSettings().charset("UTF-8");
    return originalDocument;
}
// Parse sample markup containing non-ASCII punctuation, then print it with an ASCII output
// charset, extended escape mode and pretty-printing disabled.
Document doc = Jsoup.parse("<p>THIS — IS A “TEST”. 5 > 4. trademark: ™</p>");
Document.OutputSettings settings = doc.outputSettings();
settings.prettyPrint(false)
        .escapeMode(Entities.EscapeMode.extended)
        .charset("ASCII");
String modifiedFileHtmlStr = doc.html();
System.out.println(modifiedFileHtmlStr);
// Update the document's output charset to charsetName (declared outside this snippet).
doc.outputSettings().charset(charsetName);
/**
 * Sets the charset used in this document. This method is equivalent
 * to {@link OutputSettings#charset(java.nio.charset.Charset)
 * OutputSettings.charset(Charset)} but in addition it updates the
 * charset / encoding element within the document.
 *
 * <p>This enables
 * {@link #updateMetaCharsetElement(boolean) meta charset update}.</p>
 *
 * <p>If there's no element with charset / encoding information yet it will
 * be created. Obsolete charset / encoding definitions are removed!</p>
 *
 * <p><b>Elements used:</b></p>
 *
 * <ul>
 * <li><b>Html:</b> {@code <meta charset="CHARSET">}</li>
 * <li><b>Xml:</b> {@code <?xml version="1.0" encoding="CHARSET">}</li>
 * </ul>
 *
 * @param charset Charset
 *
 * @see #updateMetaCharsetElement(boolean)
 * @see OutputSettings#charset(java.nio.charset.Charset)
 */
public void charset(Charset charset) {
    // Order matters: enable meta-charset updating, apply the new charset to the output
    // settings, then ensure the charset / encoding element exists in the document.
    updateMetaCharsetElement(true);
    outputSettings.charset(charset);
    ensureMetaCharsetElement();
}
public Document parse() throws IOException { Validate.isTrue(executed, "Request must be executed (with .execute(), .get(), or .post() before parsing response"); if (byteData != null) { // bytes have been read in to the buffer, parse that bodyStream = new ByteArrayInputStream(byteData.array()); inputStreamRead = false; // ok to reparse if in bytes } Validate.isFalse(inputStreamRead, "Input stream already read and parsed, cannot re-read."); Document doc = DataUtil.parseInputStream(bodyStream, charset, url.toExternalForm(), req.parser()); charset = doc.outputSettings().charset().name(); // update charset from meta-equiv, possibly inputStreamRead = true; safeClose(); return doc; }
/**
 * Gets the charset currently used by this document, as reported by its output settings.
 * Equivalent to calling {@link OutputSettings#charset()}.
 *
 * @return Current Charset
 *
 * @see OutputSettings#charset()
 */
public Charset charset() {
    return this.outputSettings.charset();
}