org.jsoup.select.Elements java code examples

Refine search

/**
 * Rebuilds {@code industryMap} from the anchors found on the comprehensive page.
 *
 * <p>The page named by {@code URLMapper.COMPREHENSIVE_PAGE} is requested and parsed,
 * a fixed positional path below the second {@code second-nav} element is followed,
 * and one {@code Industry} is registered per anchor that has both a {@code title}
 * and an {@code href} attribute, keyed by the title.
 *
 * @throws Exception propagated from the page request, or raised when the expected
 *     elements are not present at the hard-coded positions
 */
private void initMap() throws Exception {
  industryMap = new HashMap<>();
  String target = URLMapper.COMPREHENSIVE_PAGE.toString();
  String content = request(new URL(target));
  Document doc = Jsoup.parse(content);
  // Positional lookup: each get(n) assumes the current page layout and fails if it changes.
  Elements links = doc.getElementsByClass("second-nav")
      .get(1).children()
      .get(3).children()
      .get(3).children()
      .select("a");
  for (Element link : links) {
    if (!link.hasAttr("title") || !link.hasAttr("href")) {
      continue;
    }
    String title = link.attr("title");
    industryMap.put(title, new Industry(title, link.attr("href")));
  }
}

// Reads title, abstract and claims text from a patent document, trying the
// "us-patent-grant" markup first and the "PATDOC" markup otherwise. The first variant
// also reads the description and the main classification.
// NOTE(review): this excerpt has elided lines - the `if` blocks opened below are not
// closed within it, and the enclosing method is not shown.
Elements patent = document.select("us-patent-grant");
if (patent.size() > 0) {
  Elements e = patent.select("classification-national");
  if (e == null || e.size() == 0) {
    log.warn("Skipping patent - no classification-national");
    return null;
  // Only the first classification-national element is consulted.
  Element e2 = e.first();
  Elements mainClassification = e2.select("main-classification");
  if (mainClassification == null || mainClassification.size() == 0) {
    // NOTE(review): the "{}" placeholder has no matching argument in this call - confirm
    // which identifier was meant to be logged.
    log.warn("Skipping patent {} in document - no main classification");
    return null;
  // Strip newlines and the wrapping tags from the element's outer HTML, leaving its content.
  String main = e2.select("main-classification").outerHtml().replaceAll("\n", "")
      .replaceAll("<main-classification>", "").replaceAll("</main-classification>", "")
      .replaceFirst(" ", ""); //Replace first space - not significant, always present. But SECOND space is important
  title = patent.select("invention-title").text();
  abstr = patent.select("abstract").text();
  claims = patent.select("claims").text();
  descr = patent.select("description").text();
} else {
  // Alternative markup: same fields under different tag names.
  patent = document.select("PATDOC");
  if (patent.size() > 0) {
    // NOTE(review): first() can return null when no B540 element exists; this line would then throw.
    title = patent.select("B540").first().text();
    abstr = patent.select("SDOAB").text();
    claims = patent.select("SDOCL").text();

/**
 * Loads the page that follows {@code page}.
 *
 * @param page the page currently being processed
 * @return the document that the {@code li.page_next > a} link points to
 * @throws IOException if {@code page} has no such link, or propagated from building
 *     or fetching the next URL
 */
@Override
public Document getNextPage(Document page) throws IOException {
  Elements nextAnchors = page.select("li.page_next > a");
  if (nextAnchors.isEmpty()) {
    throw new IOException("No more pages");
  }
  // The href is resolved against this.url, so relative links are accepted.
  String href = nextAnchors.first().attr("href");
  return Http.url(new URL(this.url, href)).get();
}

/**
 Get the last matched element.
 @return The element at the final index, or <code>null</code> when this list holds no elements.
 */
public Element last() {
  if (isEmpty()) {
    return null;
  }
  return get(size() - 1);
}

/**
 Get the first matched element.
 @return The element at index 0, or <code>null</code> when this list holds no elements.
 */
public Element first() {
  if (isEmpty()) {
    return null;
  }
  return get(0);
}

/**
 * Get the form element's value of the first matched element.
 * @return The first element's value, or an empty string when there are no matched elements.
 * @see Element#val()
 */
public String val() {
  return size() > 0 ? first().val() : "";
}

  // Generic fallback for a video host that is not specifically supported: tries
  // progressively looser ways of finding a video URL on the page and returns null
  // when none succeeds.
  // NOTE(review): this excerpt has elided lines - several `if`/`for` blocks opened
  // below are never closed within it, and the enclosing method is not shown.
  logger.info("Trying to download from unknown video host " + videoPageurl);
  URL url = new URL(videoPageurl);
  Response response = Http.url(url).referrer(hqpornerVideoPageUrl).response();
  Document doc = response.parse();
  // Attempt 1: any element whose src attribute ends in ".mp4".
  Elements endingWithMp4 = doc.select("[src$=.mp4]");
  if (!endingWithMp4.isEmpty()) {
    List<String> list = new ArrayList<>();
    endingWithMp4.forEach((e) -> list.add(e.attr("src")));
    return getBestQualityLink(list);
  // Attempt 2: run pattern p3 over the page's HTML (presumably a video-URL regex - confirm).
  String link = matchUrlByPattern(p3, doc.html());
  if (link != null) {
    return link;
  // Attempt 3: fetch each same-host, extension-less src and run p3 over that page's HTML.
  Elements allElementsWithSrc = doc.select("[src*=" + url.getHost() + "]"); //all urls from same host.
  allElementsWithSrc = allElementsWithSrc.select("[src~=/[A-Za-z0-9_-]+$]"); // remove links with extensions( .js).
  for (Element e : allElementsWithSrc) {
    Document d = Http.url(e.attr("src")).referrer(url.getHost()).get();
    link = matchUrlByPattern(p3, d.html());
    if (link != null) {
  // NOTE(review): the same error is logged on two consecutive lines here - likely an
  // artifact of the elided lines; confirm against the full source.
  logger.error("Unable to get video url using generic methods.");
logger.error("Unable to get video url using generic methods.");
return null;

  /**
   * Queues the video named by the page's {@code twitter:player:stream} meta tag.
   *
   * @throws IOException if the page has no such meta tag, or propagated from loading
   *     the page or building the video URL
   */
  @Override
  public void rip() throws IOException {
    LOGGER.info("    Retrieving " + this.url.toExternalForm());
    Elements streamMetas = Http.url(this.url).get().select("meta[name=twitter:player:stream]");
    if (streamMetas.isEmpty()) {
      throw new IOException("Could not find twitter:player:stream at " + url);
    }
    // The content attribute may carry HTML-escaped ampersands; turn them back into "&".
    String streamUrl = streamMetas.first().attr("content").replaceAll("&amp;", "&");
    addURLToDownload(new URL(streamUrl), HOST + "_" + getGID(this.url));
    waitForThreads();
  }
}

  /**
   * Loads the page at {@code this.url}, locates the first image inside
   * {@code #photoImageSection}, and queues it for download.
   *
   * <p>Nothing is thrown to the caller: a missing image or an I/O failure is logged.
   */
  private void fetchImage() {
    try {
      Document doc = Http.url(this.url)
                .referrer(this.url)
                .get();
      // Find image
      Element image = doc.select("#photoImageSection img").first();
      if (image == null) {
        // first() yields null when nothing matched; stop here instead of hitting an
        // uncaught NullPointerException below.
        LOGGER.error("[!] Could not find image at " + this.url);
        return;
      }
      String imgsrc = image.attr("src");
      LOGGER.info("Found URL " + imgsrc + " via " + image);
      // Provide prefix and let the AbstractRipper "guess" the filename
      String prefix = "";
      if (Utils.getConfigBoolean("download.save_order", true)) {
        prefix = String.format("%03d_", index);
      }
      // Resolve against the page URL so a relative src still works.
      URL imgurl = new URL(url, imgsrc);
      addURLToDownload(imgurl, prefix);
    } catch (IOException e) {
      LOGGER.error("[!] Exception while loading/parsing " + this.url, e);
    }
  }
}

/**
 * Passes the media URLs found on {@code doc} to {@code downloadFile}.
 *
 * <p>When {@code url} is a video URL, the {@code href} of the player container's link
 * is passed on. Otherwise each gallery item link is opened and the {@code src} of its
 * picture is passed on; an item page that cannot be loaded is logged and skipped.
 *
 * @param doc the page to scan
 * @return the list created by this method; nothing in this method adds to it
 */
@Override
public List<String> getURLsFromPage(Document doc) {
  LOGGER.debug("Checking for urls");
  List<String> result = new ArrayList<>();
  if (isVideoUrl(url)) {
    downloadFile(doc.select("div.player-container > a").attr("href"));
    return result;
  }
  for (Element item : doc.select("div.items > div.item-container > a.item")) {
    String itemPageUrl = item.attr("href");
    try {
      Document itemPage = Http.url(new URL(itemPageUrl)).get();
      downloadFile(itemPage.select("div.picture_container > a > img").attr("src"));
    } catch (IOException e) {
      LOGGER.error("Was unable to load page " + itemPageUrl);
    }
  }
  return result;
}

/**
 * Loads the page at {@code url}, finds its download link, and queues it.
 *
 * <p>The {@code .icon-download} link is tried first; when it yields nothing, the
 * {@code src} of a {@code div > video > source} element is used instead. Failures
 * are logged rather than thrown.
 */
@Override
public void run() {
  try {
    Document page = Http.url(url).retries(RETRY_COUNT).get();
    String downloadUrl = page.select(".icon-download").attr("abs:href");
    if (downloadUrl.isEmpty()) {
      // This is here for pages with mp4s instead of images.
      downloadUrl = page.select("div > video > source").attr("src");
      if (downloadUrl.isEmpty()) {
        // Fail only when neither the image link nor the video source produced a URL.
        throw new IOException("Could not find download url for image or video.");
      }
    }
    //If a valid download url was found.
    addURLToDownload(new URL(downloadUrl), getPrefix(index));
  } catch (IOException e) {
    LOGGER.error("Error downloadiong url " + url, e);
  }
}

  /**
   * Queues the video embedded in the page's {@code .wp-video} player.
   *
   * @throws IOException if no {@code .wp-video > video > source} element is found, or
   *     propagated from loading the page or building the video URL
   */
  @Override
  public void rip() throws IOException {
    LOGGER.info("Retrieving " + this.url);
    Elements sources = Http.url(url).get().select(".wp-video > video > source");
    if (sources.isEmpty()) {
      throw new IOException("Could not find Embed code at " + url);
    }
    // attr() on the list reads the src of the first matched element that has one.
    URL videoUrl = new URL(sources.attr("src"));
    addURLToDownload(videoUrl, HOST + "_" + getGID(this.url));
    waitForThreads();
  }
}

  /**
   * Queues every video whose URL appears as a {@code "source":"..."} entry inside one
   * of the page's {@code <script>} elements, naming each after the page title.
   *
   * @throws IOException if the page has no {@code <script>} element, or propagated from
   *     loading the page or building a video URL
   */
  @Override
  public void rip() throws IOException {
    LOGGER.info("Retrieving " + this.url);
    Document page = Http.url(url).get();

    Elements scripts = page.select("script");
    if (scripts.isEmpty()) {
      throw new IOException("Could not find script code at " + url);
    }

    // The page title serves as the user-friendly part of the file name.
    String pageTitle = page.title();
    // Regex assumes highest quality source is listed first
    Pattern sourcePattern = Pattern.compile("\"source\":\"(.*?)\"");

    for (Element scriptTag : scripts) {
      Matcher sourceMatcher = sourcePattern.matcher(scriptTag.data());
      if (!sourceMatcher.find()) {
        continue;
      }
      // Only the first match within each script is used.
      addURLToDownload(new URL(sourceMatcher.group(1)), HOST + "_" + pageTitle);
    }
    waitForThreads();
  }
}

/**
 * Picks the tallest video listed in a DASH playlist.
 *
 * <p>Fetches {@code vidURL + "/DASHPlaylist.mpd"}, scans every
 * {@code MPD > Period > AdaptationSet > Representation} element, and keeps the
 * {@code BaseURL} text of the one with the greatest {@code height} attribute.
 * A representation without a height counts as height 0 and is never selected.
 *
 * @param vidURL the URL the playlist path and the chosen base URL are appended to
 * @return {@code vidURL + "/" + BaseURL} for the tallest representation, or {@code null}
 *     when the playlist cannot be loaded or lists no representation with a positive height
 */
private URL parseRedditVideoMPD(String vidURL) {
  try {
    org.jsoup.nodes.Document doc = Http.url(vidURL + "/DASHPlaylist.mpd").ignoreContentType().get();
    int largestHeight = 0;
    String baseURL = null;
    // Loops over all the videos and remembers the base url of the one with the largest height
    for (org.jsoup.nodes.Element e : doc.select("MPD > Period > AdaptationSet > Representation")) {
      String heightAttr = e.attr("height");
      int height = heightAttr.isEmpty() ? 0 : Integer.parseInt(heightAttr);
      if (height > largestHeight) {
        largestHeight = height;
        // Read BaseURL from this very element: re-querying the document by height would
        // merge the text of every representation that happens to share that height.
        baseURL = e.select("BaseURL").text();
      }
    }
    if (baseURL == null) {
      // Nothing usable was listed; do not build a URL ending in "/null".
      return null;
    }
    return new URL(vidURL + "/" + baseURL);
  } catch (IOException e) {
    e.printStackTrace();
  }
  return null;
}

/**
 * Queries the genenames.org quick search for {@code name} and extracts a description
 * from the first result row.
 *
 * @return the text of the cell(s) that directly follow a link-bearing cell in the first
 *     {@code table.quick_search} row containing a {@code td}; an empty string when the
 *     page has no such row
 * @throws MalformedURLException if the search URL cannot be built
 * @throws IOException if the page cannot be fetched or parsed
 */
public String getDescription() throws MalformedURLException, IOException{
  String baseURLp1 = "http://www.genenames.org/cgi-bin/quick_search.pl?.cgifields=type&type=equal&num=50&search=";
  String baseURLp2 = "&submit=Submit";
  // NOTE(review): name is inserted into the query string without URL encoding - confirm
  // that callers only supply values that are safe in a URL.
  URL url = new URL (baseURLp1 + name +baseURLp2);
  Document doc = Jsoup.parse(url, 20*1000);
  Element firstRow = doc.select("table.quick_search").select("tr:has(td)").first();
  if (firstRow == null) {
    // No result row: return the same empty text a non-matching select would produce.
    return "";
  }
  return firstRow.select("td:has(a) + td").text();
}

// Fetch the page (second argument is the timeout passed to Jsoup.parse), then print
// the number of anchors followed by the text and href of each one.
Document doc = Jsoup.parse(new URL("http://www.bits4beats.it/"), 2000);
 Elements resultLinks = doc.select("a");
 System.out.println("number of links: " + resultLinks.size());
 for (Element link : resultLinks) {
   // Blank line between entries.
   System.out.println();
   // The raw attribute value is printed as written in the page (not resolved to an absolute URL).
   String href = link.attr("href");
   System.out.println("Title: " + link.text());
   System.out.println("Url: " + href);
 }

// Parse the outer page, follow its first iframe, and pick one element out of the
// iframe's own document.
// let's find the iframe
Document document = Jsoup.parse(inputstream, "iso-8859-1", url);
Elements elements = document.select("iframe");
// NOTE(review): first() returns null when no iframe matched; the absUrl call below
// would then throw a NullPointerException.
Element iframe = elements.first();
// now load the iframe
// absUrl resolves the src against the base URI given to Jsoup.parse above.
URL iframeUrl = new URL(iframe.absUrl("src"));
document = Jsoup.parse(iframeUrl, 15000);
// extract the div
Element div = document.getElementById("number_forecast");

  && mapResponse.get("WWW-Authenticate").startsWith("Basic ")
) {
  LOGGER.warn(
    "Basic Authentication detected.\n"
    + "Please define and enable authentication information in the panel Preferences.\n"
Elements elementsForm = Jsoup.parse(pageSource.toString()).select("form");
  result.append(form.attr("action"));
  result.append("\" method=\"");
  result.append(form.attr("method"));
  result.append("\" />");
  for (Element input: form.select("input")) {
if (!elementsForm.isEmpty()) {
  if (!PreferencesUtil.isParsingForm()) {
    if (connection.getResponseCode() != 200) {
      LOGGER.trace("Found "+ elementsForm.size() +" ignored <form> in HTML body:"+ result);
      LOGGER.trace("Found "+ elementsForm.size() +" <form> in HTML body while status 200 Success:"+ result);
    LOGGER.debug("Found "+ elementsForm.size() +" <form> in HTML body, adding input(s) to requests:"+ result);
.parse(pageSource.toString())
.select("input")
.select("[name=csrf_token], [name=csrfToken]")
.stream()

/**
 * Builds the album title from the page's {@code og:title} meta tag.
 *
 * <p>The part of the tag's content after the last {@code '/'} is trimmed and prefixed
 * with the host. When the first page cannot be loaded, or it carries no
 * {@code og:title} tag, the superclass naming is used instead.
 *
 * @param url the album URL, used for logging and for the fallback title
 * @return {@code getHost() + "_" + title}, or the superclass title as a fallback
 * @throws MalformedURLException propagated from the superclass fallback
 */
@Override
public String getAlbumTitle(URL url) throws MalformedURLException {
  try {
    // Attempt to use album title as GID
    Element titleElement = getFirstPage().select("meta[property=og:title]").first();
    if (titleElement != null) {
      String title = titleElement.attr("content");
      title = title.substring(title.lastIndexOf('/') + 1);
      return getHost() + "_" + title.trim();
    }
    // first() yields null when the tag is absent; fall through to the default naming.
    LOGGER.info("Unable to find title at " + url);
  } catch (IOException e) {
    // Fall back to default album naming convention
    LOGGER.info("Unable to find title at " + url);
  }
  return super.getAlbumTitle(url);
}

  /**
   * Builds the album title from the gallery author and the gallery path.
   *
   * <p>When the URL path matches {@code /photos/gallery/<rest>} and the page's
   * {@code a.author} text is non-empty, the title is host, author and {@code <rest>}
   * joined by underscores. In every other case, including a page that cannot be
   * loaded or has no author link, the superclass naming is used.
   *
   * @param url the album URL
   * @return the composed title, or the superclass title as a fallback
   * @throws MalformedURLException propagated from the superclass fallback
   */
  @Override
  public String getAlbumTitle(URL url) throws MalformedURLException {
    try {
      // Attempt to use album title and username as GID
      String author = getFirstPage().select("a.author").first().text();
      Matcher galleryMatcher = Pattern.compile("^/photos/gallery/(.*)$").matcher(url.getPath());
      if (galleryMatcher.matches() && !author.isEmpty()) {
        return getHost() + "_" + author + "_" + galleryMatcher.group(1);
      }
    } catch (IOException | NullPointerException e) {
      // Fall back to default album naming convention
    }
    return super.getAlbumTitle(url);
  }
}

Javadoc

A list of Elements, with methods that act on every element in the list.

To get an Elements object, use the Element#select(String) method.

Most used methods

get
size
first
Get the first matched element.
text
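Get the combined text of all the matched elements. Note that it is possible to get repeats if the matched elements contain both parent elements and their own children, as the Element.text() method returns the combined text of a parent and all its children.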
isEmpty
attr
Set an attribute on all matched elements.
select
Find matching elements within this element list.
remove
stream
html
Set the inner HTML of each matched element.
last
Get the last matched element.
iterator

Popular in Java

Parsing JSON documents to java classes using gson
getSharedPreferences (Context)
startActivity (Activity)
findViewById (Activity)
RandomAccessFile (java.io)
Allows reading from and writing to a file in a random-access manner. This is different from the uni-
BigInteger (java.math)
An immutable arbitrary-precision signed integer. FAST CRYPTOGRAPHY: This implementation is efficient f…
Servlet (javax.servlet)
Defines methods that all servlets must implement. A servlet is a small Java program that runs within
IOUtils (org.apache.commons.io)
General IO stream manipulation utilities. This class provides static utility methods for input/outpu
Filter (javax.servlet)
A filter is an object that performs filtering tasks on either the request to a resource (a servlet o
JFrame (javax.swing)
Top PhpStorm plugins

How to use Elements in org.jsoup.select

Best Java code snippets using org.jsoup.select.Elements (Showing top 20 results out of 2,835)

Refine search

How to use
Elements
in
org.jsoup.select