/**
 * Parses the supplied HTML into a {@link Document} by delegating to the given parser. Supplying a
 * different parser, such as a simple XML (non-HTML) parser, changes how the input is read.
 *
 * @param html HTML to parse
 * @param baseUri the URL the HTML was retrieved from; used to resolve relative URLs to absolute
 *     URLs that occur before the HTML declares a {@code <base href>} tag
 * @param parser alternate {@link Parser#xmlParser() parser} to use
 * @return the document returned by {@code parser.parseInput(html, baseUri)}
 */
public static Document parse(String html, String baseUri, Parser parser) {
    final Document parsed = parser.parseInput(html, baseUri);
    return parsed;
}
doc = parser.parseInput(docData, baseUri); reader.skip(1); try { doc = parser.parseInput(reader, baseUri); } catch (UncheckedIOException e) {
/**
 * Reads the task form name from a data input association.
 *
 * <p>Picks any assignment whose {@code from} value is a {@link FormalExpression}, parses that
 * expression's body with the XML parser (empty base URI) and returns the string form of the
 * resulting document.
 *
 * @param inputAssociation the association whose assignments are inspected
 * @return the string form of the parsed expression body, or an empty string when no assignment
 *     has a {@link FormalExpression} as its {@code from} value
 */
private String readTaskFormName(DataInputAssociation inputAssociation) {
    return inputAssociation.getAssignment()
            .stream()
            .map(assignment -> assignment.getFrom())
            // instanceof is false for null, so this also drops assignments without a source
            .filter(source -> source instanceof FormalExpression)
            .map(source -> (FormalExpression) source)
            .findAny()
            .map(expression -> Parser.xmlParser().parseInput(expression.getBody(), "").toString())
            .orElse("");
}
try { Parser parser = Parser.htmlParser().setTrackErrors(0); @Nonnull Document doc = parser.parseInput(html, ""); @Nonnull Elements tags = doc.select(tagName);
/** * Attempt to find a META tag in the HTML that hints at the character set * used to write the document. */ private static String getCharsetFromMeta(byte buffer[], int maxlength) { // convert to UTF-8 String -- which hopefully will not mess up the // characters we're interested in... int len = buffer.length; if (maxlength > 0 && maxlength < len) { len = maxlength; } String html = new String(buffer, 0, len, DEFAULT_CHARSET); Document doc = Parser.htmlParser().parseInput(html, "dummy"); // look for <meta http-equiv="Content-Type" // content="text/html;charset=gb2312"> or HTML5 <meta charset="gb2312"> Elements metaElements = doc .select("meta[http-equiv=content-type], meta[charset]"); String foundCharset = null; for (Element meta : metaElements) { if (meta.hasAttr("http-equiv")) foundCharset = getCharsetFromContentType(meta.attr("content")); if (foundCharset == null && meta.hasAttr("charset")) foundCharset = meta.attr("charset"); if (foundCharset != null) return foundCharset; } return foundCharset; }
/**
 * Handles a resource describing received files and dispatches a {@link FileReceivedEvent}.
 *
 * <p>Reads the {@code content} and {@code conversationLink} fields and the author from the
 * resource, resolves the chat and the participant who sent the files, then parses the content
 * as XML. Each {@code file} element becomes a {@link ReceivedFile} built from the element text
 * and its {@code size} and {@code tid} attributes.
 *
 * @param skype the Skype instance used to look up the chat and dispatch the event
 * @param resource the JSON resource to process
 */
@Override
public void handle(SkypeImpl skype, JsonObject resource) throws ConnectionException, ChatNotFoundException, IOException {
    final String body = Utils.getString(resource, "content");
    final String conversationLink = Utils.getString(resource, "conversationLink");
    final String sender = getAuthor(resource);
    Validate.notNull(body, "Null content");
    Validate.notNull(conversationLink, "Null chat");
    Validate.notNull(sender, "Null author");

    final String senderName = getUsername(sender);
    Validate.notNull(senderName, "Null username");

    final Chat conversation = getChat(conversationLink, skype);
    Validate.notNull(conversation, "Null chatobj");

    final Participant sendingParticipant = conversation.getParticipant(senderName);
    Validate.notNull(sendingParticipant, "Null initiator");

    final List<ReceivedFile> files = Parser.xmlParser()
            .parseInput(body, "")
            .getElementsByTag("file")
            .stream()
            .map(fileElement -> {
                String fileName = fileElement.text();
                long size = Long.parseLong(fileElement.attr("size"));
                long tid = Long.parseLong(fileElement.attr("tid"));
                return new ReceivedFileImpl(fileName, size, tid);
            })
            .collect(Collectors.toList());

    skype.getEventDispatcher().callEvent(new FileReceivedEvent(conversation, sendingParticipant, files));
}
},
.decode(ByteBuffer.wrap(content)).toString(); jsoupDoc = Parser.htmlParser().parseInput(html, url);
if (!StringUtils.isEmpty(taskName)) { taskName = Parser.xmlParser().parseInput(taskName, "").toString(); formVariables.setTaskName(taskName);
Participant u = getUser(from, c); String content = resource.get("content").asString(); Document doc = Parser.xmlParser().parseInput(content, ""); if (doc.getElementsByTag("meta").size() == 0) { throw new IllegalArgumentException("No meta? " + resource);
/**
 * Checks exclusion when the configured selector ({@code style}) is lower-case while the markup
 * uses the upper-case {@code <STYLE>} tag: the tag's content must be absent from the text.
 */
@Test
public void testExclusionCase() throws IOException {
    final String html = "<html>the<STYLE>main</STYLE>content of the page</html>";

    final Config settings = new Config();
    settings.put(TextExtractor.EXCLUDE_PARAM_NAME, "style");
    final TextExtractor textExtractor = new TextExtractor(settings);

    final Document parsed = Parser.htmlParser().parseInput(html, "http://stormcrawler.net");
    final String extracted = textExtractor.text(parsed.body());

    assertEquals("the content of the page", extracted);
}
/**
 * Checks inclusion: with the include selector set to {@code DIV[id="maincontent"]}, only the
 * text inside that div (including its nested div) is extracted.
 */
@Test
public void testMainContent() throws IOException {
    final String html = "<html>the<div id='maincontent'>main<div>content</div></div>of the page</html>";

    final Config settings = new Config();
    settings.put(TextExtractor.INCLUDE_PARAM_NAME, "DIV[id=\"maincontent\"]");
    final TextExtractor textExtractor = new TextExtractor(settings);

    final Document parsed = Parser.htmlParser().parseInput(html, "http://stormcrawler.net");
    final String extracted = textExtractor.text(parsed.body());

    assertEquals("main content", extracted);
}
/**
 * Checks exclusion when the configured selector ({@code STYLE}) is upper-case while the markup
 * uses the lower-case {@code <style>} tag: the tag's content must be absent from the text.
 */
@Test
public void testExclusion() throws IOException {
    final String html = "<html>the<style>main</style>content of the page</html>";

    final Config settings = new Config();
    settings.put(TextExtractor.EXCLUDE_PARAM_NAME, "STYLE");
    final TextExtractor textExtractor = new TextExtractor(settings);

    final Document parsed = Parser.htmlParser().parseInput(html, "http://stormcrawler.net");
    final String extracted = textExtractor.text(parsed.body());

    assertEquals("the content of the page", extracted);
}