/**
 * Rebuilds {@code industryMap} from the anchors of the comprehensive page.
 *
 * <p>The page named by {@code URLMapper.COMPREHENSIVE_PAGE} is fetched through
 * {@code request}, parsed, and every anchor at a fixed position below the second
 * "second-nav" element that carries both a {@code title} and an {@code href}
 * attribute is stored as an {@code Industry} keyed by its title.
 *
 * @throws Exception if the page cannot be requested or the expected structure is absent
 */
private void initMap() throws Exception {
    industryMap = new HashMap<>();

    URL pageUrl = new URL(URLMapper.COMPREHENSIVE_PAGE.toString());
    Document page = Jsoup.parse(request(pageUrl));

    // Positional walk: second "second-nav" element -> 4th child -> 4th child -> anchors.
    Element secondNav = page.getElementsByClass("second-nav").get(1);
    Elements anchors = secondNav.children()
            .get(3).children()
            .get(3).children()
            .select("a");

    for (Element anchor : anchors) {
        // Only anchors that provide both a name and a target are usable.
        if (anchor.hasAttr("title") && anchor.hasAttr("href")) {
            String industryName = anchor.attr("title");
            String industryLink = anchor.attr("href");
            industryMap.put(industryName, new Industry(industryName, industryLink));
        }
    }
}
// Extracts title / abstract / claims / description text from a parsed patent document.
// Layout 1 ("us-patent-grant"): requires a "classification-national" element containing a
// "main-classification" element (otherwise a warning is logged and null is returned), derives
// `main` from the main-classification markup with tags, newlines and the first space removed,
// then reads invention-title, abstract, claims and description.
// Layout 2 ("PATDOC", used when no "us-patent-grant" element exists): B540 -> title,
// SDOAB -> abstract, SDOCL -> claims.
// NOTE(review): the second log.warn message contains a "{}" placeholder but no argument is passed.
// NOTE(review): several closing braces are not visible in this excerpt — confirm against the full method.
Elements patent = document.select("us-patent-grant"); if (patent.size() > 0) { Elements e = patent.select("classification-national"); if (e == null || e.size() == 0) { log.warn("Skipping patent - no classification-national"); return null; Element e2 = e.first(); Elements mainClassification = e2.select("main-classification"); if (mainClassification == null || mainClassification.size() == 0) { log.warn("Skipping patent {} in document - no main classification"); return null; String main = e2.select("main-classification").outerHtml().replaceAll("\n", "") .replaceAll("<main-classification>", "").replaceAll("</main-classification>", "") .replaceFirst(" ", ""); //Replace first space - not significant, always present. But SECOND space is important title = patent.select("invention-title").text(); abstr = patent.select("abstract").text(); claims = patent.select("claims").text(); descr = patent.select("description").text(); } else { patent = document.select("PATDOC"); if (patent.size() > 0) { title = patent.select("B540").first().text(); abstr = patent.select("SDOAB").text(); claims = patent.select("SDOCL").text();
/**
 * Loads the page that follows {@code page}.
 *
 * @param page the page currently being processed
 * @return the document behind the "li.page_next &gt; a" link, resolved against {@code this.url}
 * @throws IOException with message "No more pages" when no such link exists, or when the
 *         next page cannot be fetched
 */
@Override
public Document getNextPage(Document page) throws IOException {
    Elements nextAnchors = page.select("li.page_next > a");
    if (nextAnchors.isEmpty()) {
        throw new IOException("No more pages");
    }

    // The href may be relative, so resolve it against the ripper's own URL.
    String href = nextAnchors.first().attr("href");
    URL target = new URL(this.url, href);
    return Http.url(target).get();
}
// Generic fallback for an unrecognised video host. Loads videoPageurl (sending
// hqpornerVideoPageUrl as referrer) and tries, in order:
//   1. elements whose src ends in ".mp4" -> getBestQualityLink over their src values;
//   2. matchUrlByPattern(p3, ...) applied to the page HTML;
//   3. elements whose src contains the page's host and ends in a path segment made only of
//      letters, digits, '_' or '-': each is fetched and p3 is applied to its HTML.
// The visible tail logs an error and returns null.
// NOTE(review): the "Unable to get video url using generic methods." error is logged twice in
// a row, and several closing braces are not visible in this excerpt — confirm against the full method.
logger.info("Trying to download from unknown video host " + videoPageurl); URL url = new URL(videoPageurl); Response response = Http.url(url).referrer(hqpornerVideoPageUrl).response(); Document doc = response.parse(); Elements endingWithMp4 = doc.select("[src$=.mp4]"); if (!endingWithMp4.isEmpty()) { List<String> list = new ArrayList<>(); endingWithMp4.forEach((e) -> list.add(e.attr("src"))); return getBestQualityLink(list); String link = matchUrlByPattern(p3, doc.html()); if (link != null) { return link; Elements allElementsWithSrc = doc.select("[src*=" + url.getHost() + "]"); //all urls from same host. allElementsWithSrc = allElementsWithSrc.select("[src~=/[A-Za-z0-9_-]+$]"); // remove links with extensions( .js). for (Element e : allElementsWithSrc) { Document d = Http.url(e.attr("src")).referrer(url.getHost()).get(); link = matchUrlByPattern(p3, d.html()); if (link != null) { logger.error("Unable to get video url using generic methods."); logger.error("Unable to get video url using generic methods."); return null;
/**
 * Downloads the video advertised by the page's {@code twitter:player:stream} meta tag.
 *
 * <p>The file is queued under the name {@code HOST + "_" + getGID(url)} and the method
 * blocks until the download threads have finished.
 *
 * @throws IOException if the page cannot be loaded, the meta tag is missing, or the
 *         advertised URL is malformed
 */
@Override
public void rip() throws IOException {
    LOGGER.info(" Retrieving " + this.url.toExternalForm());
    Document doc = Http.url(this.url).get();
    Elements videos = doc.select("meta[name=twitter:player:stream]");
    if (videos.isEmpty()) {
        throw new IOException("Could not find twitter:player:stream at " + url);
    }
    String vidUrl = videos.first().attr("content");
    // Undo HTML escaping of ampersands that survives in the attribute value. The previous
    // replaceAll("&", "&") replaced "&" with itself and therefore did nothing.
    vidUrl = vidUrl.replace("&amp;", "&");
    addURLToDownload(new URL(vidUrl), HOST + "_" + getGID(this.url));
    waitForThreads();
}
}
private void fetchImage() { try { Document doc = Http.url(this.url) .referrer(this.url) .get(); // Find image Elements images = doc.select("#photoImageSection img"); Element image = images.first(); String imgsrc = image.attr("src"); LOGGER.info("Found URL " + imgsrc + " via " + images.get(0)); // Provide prefix and let the AbstractRipper "guess" the filename String prefix = ""; if (Utils.getConfigBoolean("download.save_order", true)) { prefix = String.format("%03d_", index); } URL imgurl = new URL(url, imgsrc); addURLToDownload(imgurl, prefix); } catch (IOException e) { LOGGER.error("[!] Exception while loading/parsing " + this.url, e); } } }
/**
 * Hands every media URL reachable from {@code doc} to {@code downloadFile}.
 *
 * <p>For non-video URLs each "div.items &gt; div.item-container &gt; a.item" link is opened
 * and the {@code src} of its "div.picture_container &gt; a &gt; img" image is downloaded.
 * For video URLs the {@code href} of "div.player-container &gt; a" is downloaded.
 *
 * @param doc the listing (or video) page
 * @return always an empty list; nothing is added to it because the files are passed to
 *         {@code downloadFile} directly
 */
@Override
public List<String> getURLsFromPage(Document doc) {
    LOGGER.debug("Checking for urls");
    List<String> result = new ArrayList<>();
    if (!isVideoUrl(url)) {
        for (Element page : doc.select("div.items > div.item-container > a.item")) {
            String pageWithImageUrl = page.attr("href");
            try {
                String image = Http.url(new URL(pageWithImageUrl)).get()
                        .select("div.picture_container > a > img")
                        .attr("src");
                downloadFile(image);
            } catch (IOException e) {
                // Keep going with the remaining items, but record why this one failed.
                LOGGER.error("Was unable to load page " + pageWithImageUrl, e);
            }
        }
    } else {
        String imgUrl = doc.select("div.player-container > a").attr("href");
        downloadFile(imgUrl);
    }
    return result;
}
@Override public void run() { try { Document page = Http.url(url).retries(RETRY_COUNT).get(); String downloadUrl = page.select(".icon-download").attr("abs:href"); if (downloadUrl.equals("")) { // This is here for pages with mp4s instead of images. downloadUrl = page.select("div > video > source").attr("src"); if (!downloadUrl.equals("")) { throw new IOException("Could not find download url for image or video."); } } //If a valid download url was found. addURLToDownload(new URL(downloadUrl), getPrefix(index)); } catch (IOException e) { LOGGER.error("Error downloadiong url " + url, e); } }
/**
 * Downloads the video embedded in the page at {@code url}.
 *
 * <p>The {@code src} of the first ".wp-video &gt; video &gt; source" match is queued under the
 * name {@code HOST + "_" + getGID(url)}; the method then waits for the download threads.
 *
 * @throws IOException if the page cannot be loaded, contains no matching source element,
 *         or the source URL is malformed
 */
@Override
public void rip() throws IOException {
    LOGGER.info("Retrieving " + this.url);

    Elements sources = Http.url(url).get().select(".wp-video > video > source");
    if (sources.isEmpty()) {
        throw new IOException("Could not find Embed code at " + url);
    }

    // attr() on the collection reads the attribute from the first element that has it.
    URL videoLocation = new URL(sources.attr("src"));
    String saveAs = HOST + "_" + getGID(this.url);
    addURLToDownload(videoLocation, saveAs);
    waitForThreads();
}
}
@Override public void rip() throws IOException { LOGGER.info("Retrieving " + this.url); Document doc = Http.url(url).get(); //Get user friendly filename from page title String title = doc.title(); Elements script = doc.select("script"); if (script.isEmpty()) { throw new IOException("Could not find script code at " + url); } //Regex assumes highest quality source is listed first Pattern p = Pattern.compile("\"source\":\"(.*?)\""); for (Element element : script) { Matcher m = p.matcher(element.data()); if (m.find()){ String vidUrl = m.group(1); addURLToDownload(new URL(vidUrl), HOST + "_" + title); } } waitForThreads(); } }
private URL parseRedditVideoMPD(String vidURL) { org.jsoup.nodes.Document doc = null; try { doc = Http.url(vidURL + "/DASHPlaylist.mpd").ignoreContentType().get(); int largestHeight = 0; String baseURL = null; // Loops over all the videos and finds the one with the largest height and sets baseURL to the base url of that video for (org.jsoup.nodes.Element e : doc.select("MPD > Period > AdaptationSet > Representation")) { String height = e.attr("height"); if (height.equals("")) { height = "0"; } if (largestHeight < Integer.parseInt(height)) { largestHeight = Integer.parseInt(height); baseURL = doc.select("MPD > Period > AdaptationSet > Representation[height=" + height + "]").select("BaseURL").text(); } } return new URL(vidURL + "/" + baseURL); } catch (IOException e) { e.printStackTrace(); } return null; }
/**
 * Looks up the description of {@code name} through the genenames.org quick search.
 *
 * <p>The description is read from the first result row of "table.quick_search": the cell
 * immediately following the cell that contains a link.
 *
 * @return the description text, or an empty string when the result table has no data row
 *         or the row has no such cell
 * @throws MalformedURLException if the search URL built from {@code name} is invalid
 * @throws IOException if the search page cannot be fetched within 20 seconds
 */
public String getDescription() throws MalformedURLException, IOException {
    String baseURLp1 = "http://www.genenames.org/cgi-bin/quick_search.pl?.cgifields=type&type=equal&num=50&search=";
    String baseURLp2 = "&submit=Submit";
    URL url = new URL(baseURLp1 + name + baseURLp2);
    Document doc = Jsoup.parse(url, 20 * 1000);

    Element firstRow = doc.select("table.quick_search").select("tr:has(td)").first();
    if (firstRow == null) {
        // No result row: first() returns null, so answer "no description" instead of an NPE.
        return "";
    }
    return firstRow.select("td:has(a) + td").text();
}
// Fetch the page (2000 ms timeout) and print the text and href of every anchor on it.
Document doc = Jsoup.parse(new URL("http://www.bits4beats.it/"), 2000);
Elements resultLinks = doc.select("a");
System.out.println("number of links: " + resultLinks.size());
for (int i = 0; i < resultLinks.size(); i++) {
    Element anchor = resultLinks.get(i);
    // A blank line precedes each entry.
    System.out.println();
    System.out.println("Title: " + anchor.text());
    System.out.println("Url: " + anchor.attr("href"));
}
// Parses `inputstream` as ISO-8859-1 HTML with base URI `url`, takes the first <iframe>,
// fetches the document its absolute `src` points to (15000 ms timeout) and looks up the
// element with id "number_forecast".
// NOTE(review): elements.first() is null when the page has no <iframe>, so iframe.absUrl(...)
// would throw a NullPointerException — confirm how the enclosing method should handle that.
// let's find the iframe Document document = Jsoup.parse(inputstream, "iso-8859-1", url); Elements elements = document.select("iframe"); Element iframe = elements.first(); // now load the iframe URL iframeUrl = new URL(iframe.absUrl("src")); document = Jsoup.parse(iframeUrl, 15000); // extract the div Element div = document.getElementById("number_forecast");
// NOTE(review): this span is a non-contiguous excerpt — it starts mid-condition and statements
// are missing between the visible ones; read it against the full method.
// Visible behaviour: a warning is logged when the "WWW-Authenticate" response entry starts
// with "Basic "; the <form> elements of pageSource are parsed and each form's action/method
// plus its inputs are appended to `result`; the number of forms found is logged at trace or
// debug level (conditions visible here: PreferencesUtil.isParsingForm() and the response code
// being 200); inputs named csrf_token or csrfToken are selected and streamed.
&& mapResponse.get("WWW-Authenticate").startsWith("Basic ") ) { LOGGER.warn( "Basic Authentication detected.\n" + "Please define and enable authentication information in the panel Preferences.\n" Elements elementsForm = Jsoup.parse(pageSource.toString()).select("form"); result.append(form.attr("action")); result.append("\" method=\""); result.append(form.attr("method")); result.append("\" />"); for (Element input: form.select("input")) { if (!elementsForm.isEmpty()) { if (!PreferencesUtil.isParsingForm()) { if (connection.getResponseCode() != 200) { LOGGER.trace("Found "+ elementsForm.size() +" ignored <form> in HTML body:"+ result); LOGGER.trace("Found "+ elementsForm.size() +" <form> in HTML body while status 200 Success:"+ result); LOGGER.debug("Found "+ elementsForm.size() +" <form> in HTML body, adding input(s) to requests:"+ result); .parse(pageSource.toString()) .select("input") .select("[name=csrf_token], [name=csrfToken]") .stream()
@Override public String getAlbumTitle(URL url) throws MalformedURLException { try { // Attempt to use album title as GID Element titleElement = getFirstPage().select("meta[property=og:title]").first(); String title = titleElement.attr("content"); title = title.substring(title.lastIndexOf('/') + 1); return getHost() + "_" + title.trim(); } catch (IOException e) { // Fall back to default album naming convention LOGGER.info("Unable to find title at " + url); } return super.getAlbumTitle(url); }
@Override public String getAlbumTitle(URL url) throws MalformedURLException { try { // Attempt to use album title and username as GID Document doc = getFirstPage(); Element user = doc.select("a.author").first(); String username = user.text(); String path = url.getPath(); Pattern p = Pattern.compile("^/photos/gallery/(.*)$"); Matcher m = p.matcher(path); if (m.matches() && !username.isEmpty()) { return getHost() + "_" + username + "_" + m.group(1); } } catch (IOException | NullPointerException e) { // Fall back to default album naming convention } return super.getAlbumTitle(url); } }