/**
 * Splits the input into alternating runs of digits and non-digits. Examples:
 * <pre>
 * A12    -> "A" "12"
 * 1A12'ye -> "1" "A" "12" "'ye"
 * </pre>
 *
 * @param s input.
 * @return separated list of numerical and non numerical tokens.
 */
public static List<String> separateNumbers(String s) {
  List<String> tokens = Regexps.allMatches(NUMBER_SEPARATION, s);
  return tokens;
}
/**
 * Extracts the first {@code HTML_BODY} pattern match from the given html text.
 *
 * @param html html content; must not be null.
 * @return the first body match, or null if the pattern does not match.
 */
public static String getHtmlBody(String html) {
  Preconditions.checkNotNull(html, "input cannot be null.");
  String body = Regexps.firstMatch(HTML_BODY, html);
  return body;
}
/**
 * Replaces all special html escape strings (such as {@code &....;} or {@code &#dddd;})
 * with their original characters.
 *
 * @param input input which may contain html specific strings.
 * @return cleaned input.
 */
public static String convertAmpersandStrings(String input) {
  Matcher matcher = AMPERSAND_PATTERN.matcher(input);
  return Regexps.replaceMap(matcher, HTML_STRING_TO_CHAR_MAP_FULL);
}
// NOTE(review): fragment of a larger method — enclosing signature and the
// definitions of `file` and `p1` are not visible in this chunk.
// Load the whole file as a UTF-8 string and collect group 2 of every `p1` match.
String s = TextIO.loadUtfAsString(file);
List<String> matches = Regexps.getMatchesForGroup(s, p1, 2);
// Derive a name from the file name: strip the ".html" suffix (replaceAll takes a
// regex, so "\\.html" matches the literal dot), then URL-decode it as UTF-8.
String name = URLDecoder.decode(file.toFile().getName().replaceAll("\\.html", ""), "utf-8");
// Lower-case with Turkish locale rules (dotted/dotless i handling).
name = name.toLowerCase(TurkishAlphabet.TR);
/**
 * Loads a 2-D float array from a text file with the format:
 * [1 2 3] [4 5 6]
 *
 * @param input file containing bracketed, space separated float rows.
 * @return one float[] per matched feature block.
 * @throws IOException if the file cannot be read.
 */
public static float[][] loadFromText(File input) throws IOException {
  String content = new SimpleTextReader(input, "UTF-8").asString();
  List<String> blocks = Regexps.firstGroupMatches(FEATURE_LINES_PATTERN, content);
  float[][] vectors = new float[blocks.size()][];
  for (int row = 0; row < blocks.size(); row++) {
    vectors[row] = FloatArrays.fromString(blocks.get(row), " ");
  }
  return vectors;
}
/**
 * Returns all single-line element tags (e.g. {@code <Foo a="1">}) for the given element name.
 *
 * @param allContent  content to search in.
 * @param elementName element name; surrounding angle brackets, if present, are stripped.
 * @return all matches of the opening tag pattern, case insensitive.
 */
public static List<String> getSingleLineElementData(String allContent, String elementName) {
  // Bug fix: replaceAll("<>", "") only removed the literal two-character sequence
  // "<>". An input such as "<Foo>" was left untouched and produced the broken
  // pattern "(<<Foo>)(.+?)(>)". The character class "[<>]" strips each bracket.
  elementName = elementName.trim().replaceAll("[<>]", "");
  Pattern p = Pattern.compile("(<" + elementName + ")" + "(.+?)" + "(>)",
      Pattern.CASE_INSENSITIVE | Pattern.DOTALL);
  return Regexps.allMatches(p, allContent);
}
/**
 * Extracts group 2 of the first pattern match, strips quote characters and
 * surrounding whitespace, and decodes html ampersand escapes.
 *
 * @param pattern attribute pattern whose group 2 holds the raw value.
 * @param content text to search in.
 * @return cleaned attribute value, or empty string when there is no match.
 */
private static String getAttribute(Pattern pattern, String content) {
  String raw = Regexps.firstMatch(pattern, content, 2);
  String cleaned;
  if (raw == null) {
    cleaned = "";
  } else {
    cleaned = raw.replace('\"', ' ').trim();
  }
  return TextUtil.convertAmpersandStrings(cleaned);
}
/**
 * Returns all full element chunks (from {@code <name ...} to {@code </name>}) for
 * the given element name, case insensitive, dot matching newlines.
 *
 * @param allContent  content to search in.
 * @param elementName element name; surrounding angle brackets, if present, are stripped.
 * @return all matched element chunks.
 */
public static List<String> getElementChunks(String allContent, String elementName) {
  // Bug fix: replaceAll("<>", "") only removed the literal two-character sequence
  // "<>", so an elementName like "<Foo>" kept its brackets and the compiled
  // pattern could never match. "[<>]" removes each bracket character.
  elementName = elementName.trim().replaceAll("[<>]", "");
  Pattern p = Pattern.compile("(<" + elementName + ")" + "(.+?)" + "(</" + elementName + ">)",
      Pattern.CASE_INSENSITIVE | Pattern.DOTALL);
  return Regexps.allMatches(p, allContent);
}
/**
 * Returns a map with attributes of an xml line. For example if [content] is
 * {@code <Foo a="one" b="two">} and [element] is {@code Foo} it returns
 * [a:one b:two] Map. Only the first match in the content is checked.
 *
 * @param content     text containing the element.
 * @param elementName element whose opening tag is scanned for attributes.
 * @return attribute name to value map; empty when the element is not found.
 */
public static Map<String, String> getAttributes(String content, String elementName) {
  String name = elementName.trim();
  Pattern openTag = Pattern.compile("(<" + name + ")" + "(.+?)" + "(>)",
      Pattern.CASE_INSENSITIVE | Pattern.DOTALL);
  Map<String, String> attributes = new HashMap<>();
  String tagLine = Regexps.firstMatch(openTag, content);
  if (tagLine != null) {
    // attributePattern's group 1 is the attribute name, group 3 its value.
    Matcher m = attributePattern.matcher(tagLine);
    while (m.find()) {
      attributes.put(m.group(1), m.group(3));
    }
  }
  return attributes;
}
/**
 * Reduces an html page to a minimal document: original meta content tags in the
 * head, and the script-cleaned body. If no body can be extracted, the input is
 * returned unchanged.
 *
 * @param htmlToReduce html page content.
 * @return reduced html, or the original input when no body is found.
 */
public static String reduceHtml(String htmlToReduce) {
  String body = getHtmlBody(htmlToReduce);
  if (body == null) {
    Log.warn("Cannot get html body. ");
    return htmlToReduce;
  }
  List<String> metaTags = Regexps.allMatches(HTML_META_CONTENT_TAG, htmlToReduce);
  StringBuilder reduced = new StringBuilder(HTML_START);
  reduced.append("<html><head>")
      .append(Joiner.on(" ").join(metaTags))
      .append("</head>\n")
      .append(cleanScripts(body))
      .append("</html>");
  return reduced.toString();
}
/**
 * Builds a WebDocument from a meta line and its page content lines.
 *
 * @param meta     meta line carrying url, source, crawl date, label, category and
 *                 title attributes.
 * @param pageData page content lines.
 * @return a WebDocument; attributes missing from the meta line yield null fields.
 */
public static WebDocument fromText(String meta, List<String> pageData) {
  String url = Regexps.firstMatch(urlPattern, meta, 2);
  // Bug fix: firstMatch returns null when the attribute is absent (see
  // getAttribute's null handling); the original code NPE'd on url.replaceAll.
  String id = url == null ? null : url.replaceAll("http://|https://", "");
  String source = Regexps.firstMatch(sourcePattern, meta, 2);
  String crawlDate = Regexps.firstMatch(crawlDatePattern, meta, 2);
  String labels = getAttribute(labelPattern, meta);
  String category = getAttribute(categoryPattern, meta);
  String title = getAttribute(titlePattern, meta);
  // Keep only the last path segment of the source. Null-guarded for the same
  // reason as url; note lastIndexOf is always < length(), so the original
  // "i < source.length()" check was redundant.
  if (source != null) {
    int i = source.lastIndexOf("/");
    if (i >= 0) {
      source = source.substring(i + 1);
    }
  }
  return new WebDocument(source, id, title, pageData, url, crawlDate, labels, category);
}
// NOTE(review): fragment of a loop body — the enclosing loop, the condition
// guarding this `continue`, and the origin of `line` are not visible here.
continue;
// Tokenize the line and prepare a same-capacity list for the NER tokens,
// tracking the token position with `index`.
List<String> tokens = Regexps.allMatches(splitPattern, line);
List<NerToken> nerTokens = new ArrayList<>(tokens.size());
int index = 0;