/**
 * Collects description links from a page.
 *
 * @param page parsed page to scan
 * @return one entry per anchor matched by {@code figure.t-image > b > u > a}, each built by
 *         prefixing the anchor's {@code href} with {@code urlBase}
 */
@Override
public List<String> getDescriptionsFromPage(Document page) {
    List<String> descriptionUrls = new ArrayList<>();
    for (Element anchor : page.select("figure.t-image > b > u > a")) {
        // Resolve the link once and reuse it for both the result and the debug log.
        String descriptionUrl = urlBase + anchor.select("a").first().attr("href");
        descriptionUrls.add(descriptionUrl);
        LOGGER.debug("Desc2 " + descriptionUrl);
    }
    return descriptionUrls;
}
@Override
@Override public List<String> getURLsFromPage(Document doc) { List<String> result = new ArrayList<>(); Element elem = doc.select("div[id=cc-comicbody] > a > img[id=cc-comic]").first(); // The site doesn't return properly encoded urls we replace all spaces ( ) with %20 result.add(elem.attr("src").replaceAll(" ", "%20")); return result; }
/**
 * Fetches the page referenced by the first {@code a[data-page=next]} link.
 *
 * @param doc the current page
 * @return the document loaded from the link's {@code href}
 * @throws IOException with message "No more pages" when there is no such link or its
 *         {@code href} does not start with {@code http}; also propagated from the fetch
 */
@Override
public Document getNextPage(Document doc) throws IOException {
    boolean hasNextLink = doc.select("a[data-page=next]").first() != null;
    if (hasNextLink) {
        String nextHref = doc.select("a[data-page=next]").first().attr("href");
        if (nextHref.startsWith("http")) {
            return Http.url(nextHref).get();
        }
    }
    throw new IOException("No more pages");
}
/**
 * Returns the form value of the first element in this collection.
 *
 * @return the first element's value, or an empty string when the collection has no elements
 * @see Element#val()
 */
public String val() {
    return size() > 0 ? first().val() : "";
}
/**
 * Returns the first element produced by {@code $(selector)}.
 *
 * @param selector selector passed through to {@code $}
 * @return the first matched element, or {@code null} when {@code $} returns {@code null} or
 *         an empty result
 */
public Element $element(String selector) {
    Elements matches = $(selector);
    if (matches == null || matches.size() <= 0) {
        return null;
    }
    return matches.first();
}
@Override public Document getNextPage(Document doc) throws IOException { // luscious sends xhr requests to nextPageUrl and appends new set of images to the current page while in browser. // Simply GET the nextPageUrl also works. Therefore, we do this... Element nextPageElement = doc.select("div#next_page > div > a").first(); if (nextPageElement == null) { throw new IOException("No next page found."); } return Http.url(nextPageElement.attr("abs:href")).get(); }
/** Get the string contents of the document's {@code title} element. @return Trimmed title, or empty string if none set. */ public String title() { // title is a preserve whitespace tag (for document output), but normalised here Element titleEl = getElementsByTag("title").first(); return titleEl != null ? StringUtil.normaliseWhitespace(titleEl.text()).trim() : ""; }
/**
 * Returns the {@code src} attribute of the first {@code img} element in an HTML fragment.
 *
 * @param html HTML body fragment to parse
 * @return the first image's {@code src}, or {@code null} when {@code html} is blank, contains
 *         no {@code img} element, or the first image's {@code src} is blank
 */
public static String getFirstImageSrc(String html) {
    if (StrUtils.isBlank(html)) {
        return null;
    }

    Elements images = Jsoup.parseBodyFragment(html).select("img");
    if (images == null || images.size() <= 0) {
        return null;
    }

    String src = images.first().attr("src");
    if (StrUtils.isBlank(src)) {
        return null;
    }
    return src;
}
/**
 * Fetches the next page based on the {@code offset} and {@code count} attributes of the first
 * {@code posts} element.
 *
 * @param doc the current page
 * @return the document loaded from {@code getPage(offset / 100 + 1)}, or {@code null} when
 *         {@code offset + 100} exceeds {@code count}
 * @throws IOException propagated from the fetch
 */
@Override
public Document getNextPage(Document doc) throws IOException {
    String offsetAttr = doc.getElementsByTag("posts").first().attr("offset");
    int offset = Integer.parseInt(offsetAttr);

    String countAttr = doc.getElementsByTag("posts").first().attr("count");
    int num = Integer.parseInt(countAttr);

    // NOTE(review): 100 is presumably the number of posts per page - confirm against getPage.
    boolean hasMore = !(offset + 100 > num);
    if (hasMore) {
        return Http.url(getPage(offset / 100 + 1)).get();
    }
    return null;
}
public String getAlbumTitle(URL url) throws MalformedURLException { try { // Attempt to use album title as GID Document doc = getFirstPage(); Elements elems = doc.select(".albumName"); return getHost() + "_" + elems.first().text(); } catch (Exception e) { // Fall back to default album naming convention LOGGER.warn("Failed to get album title from " + url, e); } return super.getAlbumTitle(url); }
/**
 * Waits one second, then loads the page referenced by the first
 * {@code div[id=topnav] > nav.cc-nav > a.cc-next} link.
 *
 * @param doc the current page
 * @return the document fetched from the link's {@code href}
 * @throws IOException with message "No more pages" when the link is absent; also propagated
 *         from the fetch
 */
@Override
public Document getNextPage(Document doc) throws IOException {
    sleep(1000);
    Element nextLink = doc.select("div[id=topnav] > nav.cc-nav > a.cc-next").first();
    if (nextLink != null) {
        return Http.url(nextLink.attr("href")).get();
    }
    throw new IOException("No more pages");
}
@Override public String getAlbumTitle(URL url) throws MalformedURLException { try { // Attempt to use album title as GID return getHost() + "_" + getGID(url) + "_" + getFirstPage().select("title").first().text().replaceAll(" ", "_"); } catch (IOException e) { // Fall back to default album naming convention LOGGER.info("Unable to find title at " + url); } return super.getAlbumTitle(url); }
@Override public String getAlbumTitle(URL url) throws MalformedURLException { try { Document doc = getFirstPage(); String title = doc.select("div[id=main] > table.listTable > tbody > tr > td.listLong").first().text(); return getHost() + "_" + title + "_" + getGID(url); } catch (Exception e) { // Fall back to default album naming convention LOGGER.warn("Failed to get album title from " + url, e); } return super.getAlbumTitle(url); }
@Override public Document getNextPage(Document doc) throws IOException { // Find next page Elements hrefs = doc.select("a.next"); if (hrefs.isEmpty()) { throw new IOException("No more pages"); } String nextUrl = "http://www.bcfakes.com" + hrefs.first().attr("href"); sleep(500); return Http.url(nextUrl).get(); }
@Override public Document getNextPage(Document doc) throws IOException { // Find next page Elements hrefs = doc.select("a.pagination_current + a.pagination_link"); if (hrefs.isEmpty()) { throw new IOException("No more pages"); } String nextUrl = "http://www.imagebam.com" + hrefs.first().attr("href"); sleep(500); return Http.url(nextUrl).get(); }
@Override public String getAlbumTitle(URL url) throws MalformedURLException { try { Document doc = getFirstPage(); String title = doc.select("h3 > strong").first().text(); // profile name return getHost() + "_" + title + "_" + getGID(url); } catch (Exception e) { // Fall back to default album naming convention LOGGER.warn("Failed to get album title from " + url, e); } return super.getAlbumTitle(url); }
@Override public Document getNextPage(Document doc) throws IOException { // Find next page Elements hrefs = doc.select("a[title=\"Next page\"]"); if (hrefs.isEmpty()) { throw new IOException("No more pages"); } String nextUrl = "https://jabarchives.com" + hrefs.first().attr("href"); sleep(500); return Http.url(nextUrl).get(); }
/**
 * Reads a page count from the first element with class {@code ptt}.
 *
 * The value is the trimmed text of the second-to-last child found under that element's first
 * child's first child.
 *
 * @param d parsed document
 * @param body raw body, attached to the exception on failure
 * @return the parsed integer
 * @throws ParseException when the structure is missing or the text is not an integer
 */
private static int parsePages(Document d, String body) throws ParseException {
    try {
        Elements cells = d.getElementsByClass("ptt").first().child(0).child(0).children();
        int secondToLast = cells.size() - 2;
        String pageText = cells.get(secondToLast).text().trim();
        return Integer.parseInt(pageText);
    } catch (Throwable e) {
        // Let fatal errors propagate; everything else is reported as a parse failure.
        ExceptionUtils.throwIfFatal(e);
        throw new ParseException("Can't parse gallery list pages", body);
    }
}
/**
 * Parses the preview page count with the HTML parser.
 *
 * The value is the text of the second-to-last child found under the first child's first child
 * of the first element with class {@code ptt}.
 *
 * @param document parsed document
 * @param body raw body, attached to the exception on failure
 * @return the parsed integer
 * @throws ParseException when the structure is missing or the text is not an integer
 */
public static int parsePreviewPages(Document document, String body) throws ParseException {
    try {
        Elements cells = document.getElementsByClass("ptt").first().child(0).child(0).children();
        int secondToLast = cells.size() - 2;
        String pageText = cells.get(secondToLast).text();
        return Integer.parseInt(pageText);
    } catch (Throwable e) {
        // Let fatal errors propagate; everything else is printed and reported as a parse failure.
        ExceptionUtils.throwIfFatal(e);
        e.printStackTrace();
        throw new ParseException("Can't parse preview pages", body);
    }
}
@Override public Document getNextPage(Document doc) throws IOException { // Find next page Elements nextPageUrl = doc.select("a.right"); if (nextPageUrl.isEmpty()) { throw new IOException("No more pages"); } String nextUrl = urlBase + nextPageUrl.first().attr("href"); sleep(500); Document nextPage = Http.url(nextUrl).cookies(cookies).get(); return nextPage; }