/**
 * Looks up the html body in the given text by applying the {@code HTML_BODY} pattern.
 *
 * @param html the html text to search; must not be null.
 * @return whatever {@code Regexps.firstMatch} yields for the {@code HTML_BODY} pattern on the
 *     input.
 * @throws NullPointerException if {@code html} is null.
 */
public static String getHtmlBody(String html) {
  Preconditions.checkNotNull(html, "input cannot be null.");
  final String body = Regexps.firstMatch(HTML_BODY, html);
  return body;
}
/**
 * Reads group 2 of the first match of the pattern in the content and cleans it up: double quote
 * characters are replaced with spaces, the result is trimmed and then passed through
 * {@code TextUtil.convertAmpersandStrings}. A missing match is treated as an empty string.
 *
 * @param pattern pattern whose second group holds the attribute value.
 * @param content text to search.
 * @return the cleaned attribute value.
 */
private static String getAttribute(Pattern pattern, String content) {
  final String raw = Regexps.firstMatch(pattern, content, 2);
  if (raw == null) {
    // No match: fall back to the empty string before conversion.
    return TextUtil.convertAmpersandStrings("");
  }
  final String cleaned = raw.replace('"', ' ').trim();
  return TextUtil.convertAmpersandStrings(cleaned);
}
/**
 * Returns a map with the attributes of an xml element. For example if [content] is
 * {@code <Foo a="one" b="two">} and [elementName] is {@code Foo}, it returns the map
 * [a:one b:two]. Only the first match in the content is checked.
 *
 * <p>The element name is matched literally (regex metacharacters in it have no special meaning)
 * and case-insensitively. Keys and values are taken from groups 1 and 3 of
 * {@code attributePattern}.
 *
 * @param content text that contains the element.
 * @param elementName name of the element; leading and trailing whitespace is ignored.
 * @return attribute name to value map; empty if the element is not found.
 */
public static Map<String, String> getAttributes(String content, String elementName) {
  // Quote the name so characters such as '.', '+' or '(' cannot alter or break the pattern.
  String quotedName = Pattern.quote(elementName.trim());
  // `.*?` instead of `.+?`: an attribute-less tag like `<Foo>` must end at its own '>' rather
  // than running on to the closing '>' of a following element.
  Pattern p = Pattern.compile(
      "(<" + quotedName + ")" + "(.*?)" + "(>)",
      Pattern.CASE_INSENSITIVE | Pattern.DOTALL);
  String elementLine = Regexps.firstMatch(p, content);
  Map<String, String> attributes = new HashMap<>();
  if (elementLine == null) {
    return attributes;
  }
  Matcher m = attributePattern.matcher(elementLine);
  while (m.find()) {
    attributes.put(m.group(1), m.group(3));
  }
  return attributes;
}
/**
 * Creates a {@link WebDocument} from a meta line and the page content lines.
 *
 * <p>The url, source, crawl date, labels, category and title are extracted from [meta] with the
 * corresponding patterns. The document id is the url with every {@code http://} and
 * {@code https://} occurrence removed. Only the part of the source after its last '/' is kept.
 *
 * @param meta text that carries the document meta information.
 * @param pageData content lines of the page; passed to the document as is.
 * @return the document built from the extracted values.
 * @throws NullPointerException if no url or no source can be extracted from [meta].
 */
public static WebDocument fromText(String meta, List<String> pageData) {
  // firstMatch can yield null when nothing matches; fail with a descriptive message instead of
  // an anonymous NullPointerException further below.
  String url = Preconditions.checkNotNull(
      Regexps.firstMatch(urlPattern, meta, 2), "Cannot find url in meta: %s", meta);
  String id = url.replaceAll("http://|https://", "");
  String source = Preconditions.checkNotNull(
      Regexps.firstMatch(sourcePattern, meta, 2), "Cannot find source in meta: %s", meta);
  String crawlDate = Regexps.firstMatch(crawlDatePattern, meta, 2);
  String labels = getAttribute(labelPattern, meta);
  String category = getAttribute(categoryPattern, meta);
  String title = getAttribute(titlePattern, meta);
  // Keep only the last path segment of the source.
  int i = source.lastIndexOf('/');
  if (i >= 0) {
    source = source.substring(i + 1);
  }
  return new WebDocument(source, id, title, pageData, url, crawlDate, labels, category);
}