/**
 * Scans a character sequence for URIs and returns the domains derived from
 * them that are not already present in the supplied set.
 * <p>
 * The host strings are obtained from scanContentForHosts(); each host is then
 * passed to domainFromHost() to distill it to its basic "registrar" domain.
 * Hosts for which domainFromHost() yields null are ignored.
 * <p>
 * Note that the supplied set is only consulted, never modified, by this
 * method: merging the returned domains into it is left to the caller.
 *
 * @param domains the domains that are already known; any domain contained in
 *                this set is left out of the result
 * @param content a character sequence to be scanned for URIs
 * @return a new HashSet holding the domains found in the content that are
 *         not contained in domains (possibly empty)
 */
static public HashSet<String> scanContentForDomains(HashSet<String> domains, CharSequence content) {
    final HashSet<String> extracted = new HashSet<String>();
    for (final String hostName : scanContentForHosts(content)) {
        final String candidate = domainFromHost(hostName);
        // Skip hosts without a usable domain and domains the caller already knows.
        if (candidate == null || domains.contains(candidate)) {
            continue;
        }
        extracted.add(candidate);
    }
    return extracted;
}
/**
 * Extracts and returns the host portion of a URI string.
 * <p>
 * Parsing is delegated to java.net.URI. A string that is not a syntactically
 * valid URI does not raise an exception here: the parser's message is passed
 * to debugOut() and null is returned instead.
 *
 * @param uriStr a string containing a URI
 * @return the host portion of the supplied URI, null if no host string
 *         could be found or the string could not be parsed as a URI
 */
static protected String hostFromUriStr(String uriStr) {
    debugOut("hostFromUriStr(\"" + uriStr + "\")");
    try {
        return new URI(uriStr).getHost();
    } catch (URISyntaxException e) {
        // Best effort: an unparsable candidate simply yields no host.
        debugOut(e.getMessage());
        return null;
    }
}
String host = hostFromUriStr(found); if (null != host) { host = host.toLowerCase(); while (mat.find()) { String found = mat.group(); debugOut("******** mailfound=\"" + found + "\""); found = "mailto://" + found; debugOut("*******6 mailfoundfound=\"" + found + "\" after cleanup 6"); String host = hostFromUriStr(found); if (null != host) {
// scanContentForDomains() only reads `domains`: newDom receives the domains
// found in this part's content that are not already contained in `domains`.
HashSet<String> newDom = URIScanner.scanContentForDomains(domains, part.getContent().toString());
/**
 * <p>
 * Scans a character sequence for URIs and returns the domains derived from
 * them that are not already present in the supplied set.
 * </p>
 * <p>
 * The host strings are obtained from scanContentForHosts(); each host is then
 * passed to domainFromHost() to distill it to its basic "registrar" domain.
 * Hosts for which domainFromHost() yields null are ignored.
 * </p>
 * <p>
 * The supplied set is only consulted, never modified, by this method: merging
 * the returned domains into it is left to the caller.
 * </p>
 *
 * @param domains
 *            the domains that are already known; any domain contained in
 *            this set is left out of the result
 * @param content
 *            a character sequence to be scanned for URIs
 * @return a new HashSet holding the domains found in the content that are
 *         not contained in domains (possibly empty)
 */
public static HashSet<String> scanContentForDomains(HashSet<String> domains, CharSequence content) {
    HashSet<String> unseen = new HashSet<>();
    for (String host : scanContentForHosts(content)) {
        String domain = URIScanner.domainFromHost(host);
        // Keep only resolvable domains the caller does not already know.
        if (domain != null && !domains.contains(domain)) {
            unseen.add(domain);
        }
    }
    return unseen;
}
String host = hostFromUriStr(found); if (null != host) { host = host.toLowerCase(Locale.US); LOGGER.debug("*******6 mailfoundfound=\"{}\" after cleanup 6", found); String host = hostFromUriStr(found); if (null != host) {
// scanContentForDomains() only reads `domains`: newDom receives the domains
// found in this part's content that are not already contained in `domains`.
HashSet<String> newDom = URIScanner.scanContentForDomains(domains, part.getContent().toString());
debugOut("domainFromHost(\"" + host + "\")"); String domain = null; Matcher mat; debugOut("domain=\"" + domain + "\""); return domain; if (TLDLookup.isThreePartTLD(tld)) { domain = mat.group(1); debugOut("domain=\"" + domain + ", tld=\"" + tld + "\""); return domain; if (TLDLookup.isTwoPartTLD(tld)) { domain = mat.group(1); debugOut("domain=\"" + domain + ", tld=\"" + tld + "\""); return domain; String tld = mat.group(2); domain = mat.group(1); debugOut("domain=\"" + domain + ", tld=\"" + tld + "\""); return domain;