/**
 * Parses a fragment of markup and hands back the nodes produced at the top of the document.
 *
 * <p>The fragment text is wrapped in a {@link StringReader}, the parse state is set up through
 * {@code initialiseParse}, the parser is driven by {@code runParser}, and the child nodes of
 * {@code doc} are returned.
 *
 * @param inputFragment the markup to parse
 * @param baseUri       base URI forwarded to {@code initialiseParse}
 * @param errors        error list forwarded to {@code initialiseParse}
 * @param settings      parse settings forwarded to {@code initialiseParse}
 * @return the child nodes of {@code doc} after the parser has run
 */
List<Node> parseFragment(String inputFragment, String baseUri, ParseErrorList errors, ParseSettings settings) {
    final StringReader fragmentReader = new StringReader(inputFragment);
    this.initialiseParse(fragmentReader, baseUri, errors, settings);
    this.runParser();

    final List<Node> parsedNodes = this.doc.childNodes();
    return parsedNodes;
}
}
Node node = childNodes().get(0);
return root.childNodes(); else return doc.childNodes();
String html = "<html><body><div><p>Test Data</p> <div> <p>HELLO World</p></div></div> other text</body></html>"; Document doc = Jsoup.parse(html); List<Node> children = doc.childNodes(); // We will search nodes in a breadth-first way Queue<Node> nodes = new ArrayDeque<>(); nodes.addAll(doc.childNodes()); while (!nodes.isEmpty()) { Node n = nodes.remove(); if (n instanceof TextNode && ((TextNode) n).text().trim().length() > 0) { // Do whatever you want with n. // Here we just print its text... System.out.println(n.parent().nodeName()+" contains text: "+((TextNode) n).text().trim()); } else { nodes.addAll(n.childNodes()); } }
import java.util.List;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Node;
import org.jsoup.parser.Parser;

/**
 * Demo program: parses one self-closing XML element with jsoup's XML parser and prints the
 * {@code ServicePointID} attribute of each child node of the resulting document.
 */
public class Test {

    /**
     * Parses the hard-coded XML snippet and writes one line per top-level node to standard out.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        // The element carries its data purely in attributes; quotes are escaped for the literal.
        String xml = "<schemans2:ServicepointForAccountRow AccountID=\"123456\" ServicePointID=\"987654\" "
                + "LongDescription=\"TE Fix Network RES SINGLE PHS/TEP 13 MR/FN TEP Rt 0010/3220 W INA RD, 13203, TUCSON, AZ, 85741-2169, TEP\" "
                + "UsageInfo=\"Add\" />";

        // Parser.xmlParser() is passed explicitly so the input is handled as XML.
        Document parsed = Jsoup.parse(xml, "", Parser.xmlParser());

        List<Node> topLevel = parsed.childNodes();
        for (int i = 0; i < topLevel.size(); i++) {
            System.out.println(topLevel.get(i).attr("ServicePointID"));
        }
    }
}
/**
 * Parses a fragment of markup and returns the child nodes of the resulting document.
 *
 * <p>Sets up the parse via {@code initialiseParse}, drives it with {@code runParser}, then reads
 * the children of {@code doc}.
 *
 * @param inputFragment the markup to parse
 * @param baseUri       base URI forwarded to {@code initialiseParse}
 * @param errors        error list forwarded to {@code initialiseParse}
 * @return the child nodes of {@code doc} after the parser has run
 */
List<Node> parseFragment(String inputFragment, String baseUri, ParseErrorList errors) {
    initialiseParse(inputFragment, baseUri, errors);
    runParser();

    final List<Node> parsedNodes = doc.childNodes();
    return parsedNodes;
}
}
/**
 * Rule rpd6s1: each HTML or HTML5 document must begin with a valid doctype declaration.
 *
 * <p>The check passes when at least one child node of {@code html5Document} is a
 * {@link DocumentType} whose {@code name} attribute equals "html" (ignoring case) and whose
 * {@code publicId} and {@code systemId} attributes are both empty. The outcome is reported
 * through {@code assertTrue} with severity {@code Type.ERROR} and key {@code rpd6s1.doctype}.
 */
public void validateRpd6s1() {
    boolean validDocumentType = this.html5Document.childNodes().stream()
            .filter(DocumentType.class::isInstance)
            .anyMatch(doctype -> {
                // The name is checked first; the identifiers are only read for an "html" doctype.
                if (!"html".equalsIgnoreCase(doctype.attr("name"))) {
                    return false;
                }
                String publicId = doctype.attr("publicId");
                String systemId = doctype.attr("systemId");
                return StringUtils.isAllEmpty(publicId, systemId);
            });

    assertTrue(Type.ERROR, "rpd6s1.doctype", validDocumentType);
}
/**
 * Builds an HTML digest of {@code str}.
 *
 * <p>The input is parsed as a body fragment and its nodes are collected through
 * {@code getAllTextNode}. For every collected {@link TextNode}, the outer HTML of its parent is
 * appended to the digest and the text length is added to a running total; once that total
 * exceeds {@code size}, {@code " ..."} is appended and collection stops. If the input, parsed as
 * a full document, contains a {@code video} element in its body, the first one is placed in front
 * of the digest followed by {@code <br/>}.
 *
 * @param str  the HTML source to summarise
 * @param size text-length threshold after which the digest is cut off
 * @return the trimmed digest markup
 */
public static String autoDigest(String str, int size) {
    List<Node> collected = new ArrayList<>();
    getAllTextNode(Jsoup.parseBodyFragment(str).childNodes(), collected);

    StringBuilder summary = new StringBuilder();
    int consumed = 0;
    for (Node candidate : collected) {
        if (!(candidate instanceof TextNode)) {
            continue;
        }
        summary.append(candidate.parent().outerHtml());
        consumed += ((TextNode) candidate).text().length();
        if (consumed > size) {
            summary.append(" ...");
            break;
        }
    }

    String digest = summary.toString();

    // A leading video, when present, is shown ahead of the textual digest.
    Elements videos = Jsoup.parse(str).body().select("video");
    if (videos != null && !videos.isEmpty()) {
        digest = videos.get(0).toString() + "<br/>" + digest;
    }
    return digest.trim();
}
for( Node node : doc.childNodes() ) // Iterate over the child nodes of the document
for (org.jsoup.nodes.Node n : d.childNodes()) { createDOM(n, out,doc,ns);
for (org.jsoup.nodes.Node n : d.childNodes()) { createDOM(n, out,doc,ns);
for (org.jsoup.nodes.Node n : d.childNodes()) { createDOM(n, out,doc,ns);
for (org.jsoup.nodes.Node n : d.childNodes()) { createDOM(n, out,doc,ns);
org.jsoup.nodes.Document d = ((org.jsoup.nodes.Document) node); for (org.jsoup.nodes.Node n : d.childNodes()) { createDOM(n, out, doc, ns);
for (org.jsoup.nodes.Node n : d.childNodes()) { createDOM(n, out, doc, ns);
/**
 * Extracts the stored Time magazine page ({@code time.html}) and checks the start of the first
 * child's text and the detected image URL. The document's child nodes, rendered as a string,
 * serve as the failure message of the image-URL assertion.
 */
@Test
public void testTimeMagazine() {
    Article article = TestHelper.extractFromTestFile(
            "http://content.time.com/time/health/article/0,8599,2011497,00.html",
            "time.html");

    String firstChildText = article.document.child(0).text();
    assertStartsWith("This month, the federal government released", firstChildText);

    String failureMessage = article.document.childNodes().toString();
    assertEquals(
            failureMessage,
            "http://img.timeinc.net/time/daily/2010/1008/360_bp_oil_spill_0817.jpg",
            article.imageUrl);
}
/**
 * Runs the full extraction pipeline on a file stored under {@code test_data/}.
 *
 * <p>The file is read through {@code CharsetConverter.readStream}, then passed through
 * {@code extractMetadata}, {@code extractContent} and {@code estimateReadingTime}. The child
 * nodes of the resulting document are logged before the article is returned.
 *
 * <p>The input stream is opened in a try-with-resources block so that this method always closes
 * it, including when extraction throws.
 *
 * @param baseUri  base URI handed to {@code ArticleExtractor.with}
 * @param testFile file name relative to the {@code test_data/} directory
 * @return the extracted article, or {@code null} after {@code fail(...)} has been called because
 *         the file could not be opened or closed
 */
public static Article extractFromTestFile(String baseUri, String testFile) {
    try (FileInputStream input = new FileInputStream(new File("test_data/" + testFile))) {
        Article article = ArticleExtractor.with(baseUri, CharsetConverter.readStream(input).content)
                .extractMetadata()
                .extractContent()
                .estimateReadingTime()
                .article();
        Log.i("%s", article.document.childNodes().toString());
        return article;
    } catch (java.io.IOException e) {
        // Covers FileNotFoundException from opening the file as well as failures from close().
        // Fully qualified so the block does not depend on an extra import.
        fail(e.getMessage());
        return null;
    }
}
}
return root.childNodes(); else return doc.childNodes();