/**
 * Parses an HTML string into a {@link Document}, delegating to
 * {@link Parser#parse(String, String)}. The parser will make a sensible,
 * balanced document tree out of any HTML.
 *
 * @param html    HTML to parse
 * @param baseUri the URL where the HTML was retrieved from; used to resolve
 *                relative URLs to absolute URLs that occur before the HTML
 *                declares a {@code <base href>} tag
 * @return the Document produced by the parser
 */
public static Document parse(String html, String baseUri) {
    final Document parsed = Parser.parse(html, baseUri);
    return parsed;
}
/**
 * Parses an HTML string into a {@link Document} without a base URI.
 * An empty string is handed to the parser as the base URI, so absolute URL
 * detection relies on the HTML including a {@code <base href>} tag.
 *
 * @param html HTML to parse
 * @return the Document produced by the parser
 * @see #parse(String, String)
 */
public static Document parse(String html) {
    // No fetch location is known for this input.
    final String noBaseUri = "";
    return Parser.parse(html, noBaseUri);
}
/**
 * Parses HTML into a Document by delegating to {@link #parse(String, String)}.
 *
 * @param bodyHtml HTML to parse
 * @param baseUri base URI of document (i.e. original fetch location), for resolving relative URLs.
 *
 * @return parsed Document
 * @deprecated Use {@link #parseBodyFragment} or {@link #parseFragment} instead.
 */
@Deprecated
public static Document parseBodyFragmentRelaxed(String bodyHtml, String baseUri) {
    return parse(bodyHtml, baseUri);
}
@Override public List<List<String>> getVolumeTitleAndUrlOnMainPage( String urlString, String allPageString ) { // combine volumeList and urlList into combinationList, return it. List<List<String>> combinationList = new ArrayList<List<String>>(); List<String> urlList = new ArrayList<String>(); List<String> volumeList = new ArrayList<String>(); int volumeCount = 0; Document nodes = Parser.parse(allPageString, urlString); for (Element e : nodes.select(".serialise_list.Blue_link2 li>a")) { String strUrl = pageBaseURL + e.attr("href"); String strChapterName = e.text(); urlList.add(strUrl); volumeList.add(strChapterName); volumeCount++; } totalVolume = volumeCount; Common.debugPrintln( "共有" + totalVolume + "集" ); combinationList.add( volumeList ); combinationList.add( urlList ); return combinationList; } }
/**
 * Derives the comic title from the main page: the combined text of the
 * page's {@code title} elements, cut at the first space.
 *
 * <p>If the result is empty an error is reported through
 * {@code Common.errorReport} and the empty value is still returned.
 *
 * @param urlString     URL of the main page; passed to the parser as the base URI
 * @param allPageString HTML source of the main page
 * @return the part of the title text that precedes the first space
 */
@Override
public String getTitleOnMainPage( String urlString, String allPageString ) {
    Document page = Parser.parse(allPageString, urlString);
    String fullTitle = page.getElementsByTag("title").text();
    String[] pieces = fullTitle.split(" ");
    String firstPiece = pieces[0];
    if (firstPiece.length() == 0) {
        Common.errorReport("取得標題失敗!值為空");
    }
    return firstPiece;
}
@Override public String getTitleOnMainPage( String urlString, String allPageString ) { Document nodes = Parser.parse(allPageString, urlString); String title = nodes.title(); if (title.length() == 0) Common.errorReport("取得標題失敗!值為空"); // 大家的玩具漫畫,動畫,在線漫畫 - 8comic.com 無限動漫 // 我的英雄學院漫畫,動畫,在線漫畫 綠谷出久,歐爾麥特,爆豪勝己,麗日禦茶子,飯田天哉 - 8comic.com 無限動漫 // 盛氣凌人漫畫,動畫,在線漫畫 成瀨翔,町田由希 - 8comic.com 無限動漫 // 食戟之靈漫畫,動畫,在線漫畫 幸平創真,幸平城一郞,峰崎 - 8comic.com 無限動漫 // 聲之形漫畫,動畫,在線漫畫 西宮硝子,石田將也 - 8comic.com 無限動漫 // 一拳超人漫畫,動畫,在線漫畫 福克高,馬魯哥利 - 8comic.com 無限動漫 // 美食的俘虜免費漫畫,動畫,線上觀看 - 免費漫畫區 阿虜,小松 - 無限動漫狂熱社群 - 8comic.com comicbus.com //Pattern titlePattern = Pattern.compile("(.+)漫畫,動畫,在線漫畫\\s+.*\\s+- 8comic\\.com 無限動漫"); Pattern titlePattern = Pattern.compile("(.+)免費漫畫,動畫,.+8comic\\.com.+"); Matcher titleMatcher = titlePattern.matcher(title); if (!titleMatcher.find()) Common.errorReport("取得標題失敗!Regular Expression 無發擷取標題(頁面已改版?)"); return titleMatcher.group(1); // 0 means full }
@Override public List<List<String>> getVolumeTitleAndUrlOnMainPage( String urlString, String allPageString ) { Document nodes = Parser.parse(allPageString, urlString);