/**
 * Resolves a list of matcher names into matchers.
 *
 * Each name is passed to {@code getByName} and the result is appended as-is,
 * in the same order as the input. No filtering is applied to the values
 * returned by {@code getByName}.
 *
 * @param listNames names to resolve
 * @return a new list holding one entry per input name
 */
public static List<FileMatcher> getListByNames(List<String> listNames) {
    final List<FileMatcher> resolved = new ArrayList<FileMatcher>();
    for (final String matcherName : listNames) {
        final FileMatcher matcher = getByName(matcherName);
        resolved.add(matcher);
    }
    return resolved;
}
public Language getLanguage(String link) { int end = link.lastIndexOf("wiki"); if (end < 1) { throw new IllegalStateException("No language detected for " + link); } int beg; for (beg = end-1; beg >=0 && isLangChar(link.charAt(beg)); beg--) { // All work is done in loop condition. } return Language.getByLangCode(link.substring(beg + 1, end)); }
linkMatchers = new ArrayList<FileMatcher>(); for (String name : cmd.getOptionValues("n")) { FileMatcher matcher = FileMatcher.getByName(name); if (matcher == null) { System.err.println("Invalid matcher name: " + name + "\nValid matcher names: \n" + FileMatcher.getAllNames().toString()); System.exit(1); linkMatchers = FileMatcher.getListByNames(conf.getConf().get().getStringList("download.matcher"));
/**
 * Collects the files found for every (language, matcher) combination.
 *
 * Iterates over the languages in the outer loop and the matchers in the inner
 * loop, calling the single-language {@code getFiles} for each pair and
 * appending its result. A warning is logged for each pair that yields no files.
 *
 * @param langs languages to look up
 * @param matchers matchers to apply for each language
 * @return a new list with all files found, in iteration order
 */
public List<File> getFiles(LanguageSet langs, FileMatcher ... matchers) { List<File> matches = new ArrayList<File>(); for (Language l : langs) { for (FileMatcher fm : matchers) { List<File> f = getFiles(l, fm); if (f.isEmpty()) { LOG.warn("no files matching language " + l + ", matcher " + fm.getName()); } matches.addAll(f); } } return matches; } public List<File> getFiles(Language language, FileMatcher ... matchers) {
/**
 * Builds dump link descriptors for every link accepted by one of this
 * object's matchers.
 *
 * For each matcher, the links it matches are turned into a
 * {@code DumpLinkInfo} whose URL is {@code BASEURL_STRING} followed by the
 * link. The MD5 of each descriptor is looked up by its download name in the
 * map returned by {@code getMd5Sums}; it is null when the map has no entry.
 *
 * @param links candidate links
 * @return multimap from each matcher to the descriptors built for its matches
 * @throws IOException propagated from {@code getMd5Sums} or from URL construction
 */
public Multimap<FileMatcher, DumpLinkInfo> getDumpFiles(List<String> links) throws IOException {
    final Multimap<FileMatcher, DumpLinkInfo> byMatcher = HashMultimap.create();
    final Map<String, String> checksums = getMd5Sums(links);

    for (FileMatcher matcher : matchers) {
        // An empty match list simply contributes nothing.
        for (String matchedLink : matcher.match(links)) {
            final URL target = new URL(BASEURL_STRING + matchedLink);
            final int number = matcher.getNumber(matchedLink);
            final DumpLinkInfo info = new DumpLinkInfo(lang, dumpDate, matcher, target, number);
            info.setMd5(checksums.get(info.getDownloadName()));
            byMatcher.put(matcher, info);
        }
    }
    return byMatcher;
}
/**
 * Creates a loader for one SQL links dump file.
 *
 * The language is derived from the absolute path of {@code file} via
 * {@code FileMatcher.LINK_SQL.getLanguage}. The constructor also logs a
 * rough size estimate based on the number of links the DAO currently
 * reports for that language.
 *
 * @param dao link DAO, also queried for the current link count
 * @param pageDao page DAO
 * @param metaDao meta-info DAO
 * @param file SQL dump file to load
 * @param existing set of already-known links
 * @throws DaoException if the link count cannot be retrieved
 */
public SqlLinksLoader(LocalLinkDao dao, LocalPageDao pageDao, MetaInfoDao metaDao, File file, LocalLinkSet existing) throws DaoException {
    this.dao = dao;
    this.metaDao = metaDao;
    this.pageDao = pageDao;
    this.sqlDump = file;
    this.language = FileMatcher.LINK_SQL.getLanguage(file.getAbsolutePath());

    int currentCount = dao.getCount(new DaoFilter().setLanguages(language));
    // Guess that there will be twice as many links as there are now; to be safe,
    // the array size should be 3 times as big. Computed as a long because
    // count * 6 overflows int once the count exceeds roughly 357 million.
    long estimate = Math.max(10000, currentCount) * 6L;
    LOG.info("guessing at size of array at " + estimate);

    this.existing = existing;
}
/**
 * Get MD5 of the dump of the specified language and dumpDate.
 * Maps download name to MD5 sum.
 *
 * The checksum listing is the first link accepted by {@code FileMatcher.MD5},
 * fetched from {@code BASEURL_STRING} followed by that link and read as UTF-8.
 * Each line is split on two consecutive non-word characters; the first part
 * is taken as the MD5 and the second as the file name.
 *
 * @param links links of the dump; an empty list yields an empty map
 * @return map from file name to MD5 sum
 * @throws IOException if the checksum listing cannot be fetched or read
 */
protected Map<String, String> getMd5Sums(List<String> links) throws IOException {
    HashMap<String, String> md5s = new HashMap<String, String>();
    if (links.isEmpty()) {
        return md5s;
    }
    FileMatcher md5Matcher = FileMatcher.MD5;
    URL md5Url = new URL(BASEURL_STRING + md5Matcher.match(links).get(0));

    // IOUtils.readLines does not close the stream, so close it explicitly
    // to avoid leaking the underlying connection.
    List<String> lines;
    java.io.InputStream in = md5Url.openStream();
    try {
        lines = IOUtils.readLines(in, "UTF-8");
    } finally {
        in.close();
    }

    for (String line : lines) {
        String[] parsedInfo = line.split("\\W{2}");
        if (parsedInfo.length < 2) {
            // Blank or malformed line: no "<md5><sep><name>" pair to record.
            continue;
        }
        String md5 = parsedInfo[0];
        String fileName = parsedInfo[1];
        md5s.put(fileName, md5);
    }
    return md5s;
}
public static List<File> getFiles(Language lang, FileMatcher fm, Configuration configuration) { File downloadPath = new File(configuration.get().getString("download.path")); if (downloadPath == null) { throw new IllegalArgumentException("missing configuration for download.path"); } if (LOG != null) LOG.debug("scanning download path " + downloadPath + " for files"); List<File> matchingFiles = new ArrayList<File>(); File langDir = new File(downloadPath, lang.getLangCode()); if (!langDir.isDirectory()) { return matchingFiles; } String mostRecent = null; for (File dateDir : langDir.listFiles((FileFilter) DirectoryFileFilter.INSTANCE)) { if (!dateDir.isDirectory()) { continue; } // skip if older than most recent if (mostRecent != null && dateDir.getName().compareTo(mostRecent) < 0) { continue; } List<File> lf = fm.matchFiles(Arrays.asList(dateDir.listFiles())); if (!lf.isEmpty()) { mostRecent = dateDir.getName(); matchingFiles = lf; } } return matchingFiles; }
/**
 * Renders every dump link returned by {@code getDumps()} as one
 * tab-separated line.
 *
 * The columns are, in order: language code, date, matcher name, counter,
 * URL and MD5.
 *
 * @return one line per dump link, grouped by dump date and then by matcher
 */
public List<String> getLangLinks() throws WikiBrainException, IOException, ParseException {
    final List<String> lines = new ArrayList<String>();
    final Map<String, Multimap<FileMatcher, DumpLinkInfo>> dumpsByDate = this.getDumps();

    for (Map.Entry<String, Multimap<FileMatcher, DumpLinkInfo>> dated : dumpsByDate.entrySet()) {
        final Multimap<FileMatcher, DumpLinkInfo> byMatcher = dated.getValue();
        for (FileMatcher matcher : byMatcher.keySet()) {
            for (DumpLinkInfo info : byMatcher.get(matcher)) {
                final StringBuilder row = new StringBuilder();
                row.append(info.getLanguage().getLangCode()).append("\t");
                row.append(info.getDate()).append("\t");
                row.append(info.getLinkMatcher().getName()).append("\t");
                row.append(info.getCounter()).append("\t");
                row.append(info.getUrl()).append("\t");
                row.append(info.getMd5());
                lines.add(row.toString());
            }
        }
    }
    return lines;
}
/**
 * Creates a loader for one SQL links dump file.
 *
 * The language is derived from the absolute path of {@code file} via
 * {@code FileMatcher.LINK_SQL.getLanguage}. The constructor also logs a
 * rough size estimate based on the number of links the DAO currently
 * reports for that language.
 *
 * @param dao link DAO, also queried for the current link count
 * @param pageDao page DAO
 * @param metaDao meta-info DAO
 * @param file SQL dump file to load
 * @param existing set of already-known links
 * @throws DaoException if the link count cannot be retrieved
 */
public SqlLinksLoader(LocalLinkDao dao, LocalPageDao pageDao, MetaInfoDao metaDao, File file, LocalLinkSet existing) throws DaoException {
    this.dao = dao;
    this.metaDao = metaDao;
    this.pageDao = pageDao;
    this.sqlDump = file;
    this.language = FileMatcher.LINK_SQL.getLanguage(file.getAbsolutePath());

    int currentCount = dao.getCount(new DaoFilter().setLanguages(language));
    // Guess that there will be twice as many links as there are now; to be safe,
    // the array size should be 3 times as big. Computed as a long because
    // count * 6 overflows int once the count exceeds roughly 357 million.
    long estimate = Math.max(10000, currentCount) * 6L;
    LOG.info("guessing at size of array at " + estimate);

    this.existing = existing;
}
public DumpLinkInfo(String langCode, String date, String linkMatcher, String url, String md5, int counter) throws MalformedURLException { this.language = Language.getByLangCode(langCode); this.date = date; this.linkMatcher = FileMatcher.getByName(linkMatcher); this.url = new URL(url); this.md5 = md5; this.counter = counter; }
/**
 * Loads the pages of one dump file.
 *
 * The language is taken from the file's absolute path, which is expected to
 * contain lang + "wiki" (for example "enwiki"). If
 * {@code keepProcessingArticles} rejects that language, nothing is done.
 * Otherwise the file is split with {@code DumpSplitter} and each page is
 * handed to {@code processOnePage} through {@code ParallelForEach.iterate}.
 * A {@code WpParseException} is logged as a warning and does not abort the
 * run.
 *
 * @param file dump file to load
 */
public void load(final File file) {
    final Language lang = FileMatcher.ARTICLES.getLanguage(file.getAbsolutePath());
    if (keepProcessingArticles(lang)) {
        final DumpSplitter splitter = new DumpSplitter(file);
        final Procedure<String> pageHandler = new Procedure<String>() {
            @Override
            public void call(String page) throws Exception {
                try {
                    processOnePage(file, lang, page);
                } catch (WpParseException e) {
                    LOG.warn("parsing of " + file.getPath() + " failed:", e);
                }
            }
        };
        ParallelForEach.iterate(
                splitter.iterator(),
                WpThreadUtils.getMaxThreads(),
                1000,
                pageHandler,
                Integer.MAX_VALUE);
    }
}
/**
 * Loads the pages of one dump file.
 *
 * The language is taken from the file's absolute path, which is expected to
 * contain lang + "wiki" (for example "enwiki"). If
 * {@code keepProcessingArticles} rejects that language, nothing is done.
 * Otherwise the file is split with {@code DumpSplitter} and each page is
 * handed to {@code processOnePage} through {@code ParallelForEach.iterate}.
 * A {@code WpParseException} is logged as a warning and does not abort the
 * run.
 *
 * @param file dump file to load
 */
public void load(final File file) {
    final Language lang = FileMatcher.ARTICLES.getLanguage(file.getAbsolutePath());
    if (keepProcessingArticles(lang)) {
        final DumpSplitter splitter = new DumpSplitter(file);
        final Procedure<String> pageHandler = new Procedure<String>() {
            @Override
            public void call(String page) throws Exception {
                try {
                    processOnePage(file, lang, page);
                } catch (WpParseException e) {
                    LOG.warn("parsing of " + file.getPath() + " failed:", e);
                }
            }
        };
        ParallelForEach.iterate(
                splitter.iterator(),
                WpThreadUtils.getMaxThreads(),
                1000,
                pageHandler,
                Integer.MAX_VALUE);
    }
}