/**
 * Runs the full build for {@code metricName}.
 *
 * Steps, in order:
 * <ol>
 *   <li>if {@code deleteExistingData} is set, remove the existing data directories;</li>
 *   <li>build the concept file when one of the submetrics requires it;</li>
 *   <li>build every metric name returned by {@code getSubmetrics(metricName)}.</li>
 * </ol>
 *
 * No exception is caught here; anything raised by one of the steps propagates to the caller.
 *
 * @throws ConfigurationException propagated from the steps above
 * @throws DaoException propagated from the steps above
 * @throws IOException propagated from the steps above
 * @throws WikiBrainException declared for callers; propagated if raised by a step
 * @throws InterruptedException propagated from building an individual metric
 */
public void build()
        throws ConfigurationException, DaoException, IOException, WikiBrainException, InterruptedException {
    // Optional clean slate before anything is (re)built.
    if (deleteExistingData) {
        deleteDataDirectories();
    }

    buildConceptsIfNecessary();

    LOG.info("building metric " + metricName);
    for (String submetric : getSubmetrics(metricName)) {
        buildMetric(submetric);
    }
}
/**
 * Returns the metric registered under this builder's {@code metricName}.
 * Delegates to {@code getMetric(String)}; the call is made while holding this
 * instance's monitor because the method is {@code synchronized}.
 *
 * @return the metric for {@code metricName}
 * @throws ConfigurationException propagated from {@code getMetric(String)}
 */
public synchronized SRMetric getMetric() throws ConfigurationException {
    final SRMetric topLevelMetric = getMetric(metricName);
    return topLevelMetric;
}
/**
 * Returns the configuration of this builder's {@code metricName}.
 * Delegates to {@code getMetricConfig(String)}.
 *
 * @return the configuration for {@code metricName}
 * @throws ConfigurationException propagated from {@code getMetricConfig(String)}
 */
public Config getMetricConfig() throws ConfigurationException {
    final Config topLevelConfig = getMetricConfig(metricName);
    return topLevelConfig;
}
SRBuilder builder = new SRBuilder(env, metric); if (cmd.hasOption("g")) { builder.setDatasetNames(Arrays.asList(cmd.getOptionValues("g"))); builder.setRowIdsFromFile(cmd.getOptionValue("p")); builder.setBuildCosimilarity(true); builder.setColIdsFromFile(cmd.getOptionValue("q")); builder.setBuildCosimilarity(true); builder.setValidMostSimilarIdsFromFile(cmd.getOptionValue("y")); builder.setBuildCosimilarity(true); builder.setSkipBuiltMetrics(true); builder.setDeleteExistingData(false); builder.setDeleteExistingData(Boolean.valueOf(cmd.getOptionValue("d"))); builder.setMode(Mode.valueOf(cmd.getOptionValue("o").toUpperCase())); builder.setLanguage(Language.getByLangCode(cmd.getOptionValue("l"))); builder.setMaxResults(Integer.valueOf(cmd.getOptionValue("r"))); builder.setCreateFakeGoldStandard(true); builder.build();
public void buildMetric(String name) throws ConfigurationException, DaoException, IOException, InterruptedException { String type = getMetricType(name); if (type.equals("densevector.word2vec")) { initWord2Vec(name); SRMetric metric = getMetric(name); if (type.equals("ensemble")) { ((EnsembleMetric)metric).setTrainSubmetrics(false); // Do it by hand Dataset ds = getDataset(); if (mode == Mode.SIMILARITY || mode == Mode.BOTH) { if (skipBuiltMetrics && metric.similarityIsTrained()) { LOG.info("metric " + name + " mostSimilar() is already trained... skipping"); } else { Config config = getMetricConfig(name); int n = maxResults * EnsembleMetric.SEARCH_MULTIPLIER; TIntSet validIds = validMostSimilarIds; validIds = readIds(path);
public void create(String path) throws ConfigurationException, DaoException, WikiBrainException, IOException, InterruptedException { SRBuilder builder = new SRBuilder(env, "word2vec", lang); builder.setSkipBuiltMetrics(true); builder.setCreateFakeGoldStandard(true); builder.build();
String type = getMetricType(parentName); Config config = getMetricConfig(parentName); List<String> toAdd = new ArrayList<String>(); if (type.equals("ensemble") || type.equals("simple-ensemble")) { for (String child : config.getStringList("metrics")) { toAdd.addAll(getSubmetrics(child)); toAdd.add(child); toAdd.addAll(getSubmetrics(config.getString("generator.basemetric"))); } else if (type.equals("milnewitten")) { toAdd.add(config.getString("inlink"));
private void buildConceptsIfNecessary() throws IOException, ConfigurationException, DaoException { boolean needsConcepts = false; for (String name : getSubmetrics(metricName)) { String type = getMetricType(name); if (type.equals("sparsevector.esa") || type.equals("sparsevector.mostsimilarconcepts")) { needsConcepts = true; } } if (!needsConcepts) { return; } File path = FileUtils.getFile( env.getConfiguration().get().getString("sr.concepts.path"), language.getLangCode() + ".txt" ); path.getParentFile().mkdirs(); // Check to see if concepts are already built if (path.isFile() && FileUtils.readLines(path).size() > 1) { return; } LOG.info("building concept file " + path.getAbsolutePath() + " for " + metricName); SRConceptSpaceGenerator gen = new SRConceptSpaceGenerator(language, env.getConfigurator().get(LocalLinkDao.class), env.getConfigurator().get(LocalPageDao.class)); gen.writeConcepts(path); LOG.info("finished creating concept file " + path.getAbsolutePath() + " with " + FileUtils.readLines(path).size() + " lines"); }
private void initWord2Vec(String name) throws ConfigurationException, IOException, DaoException, InterruptedException { Config config = getMetricConfig(name).getConfig("generator"); File model = Word2VecGenerator.getModelFile(config.getString("modelDir"), language); if (skipBuiltMetrics && model.isFile()) { return; File downloadPath = new File(localize(config.getString("binfile"))); if (!downloadPath.isFile()) { FileDownloader downloader = new FileDownloader(); downloader.download(new URL(localize(config.getString("url"))), downloadPath);
/**
 * Returns the type string of this builder's {@code metricName}.
 * Delegates to {@code getMetricType(String)}.
 *
 * @return the type of {@code metricName}
 * @throws ConfigurationException propagated from {@code getMetricType(String)}
 */
public String getMetricType() throws ConfigurationException {
    final String topLevelType = getMetricType(metricName);
    return topLevelType;
}
/**
 * Removes the on-disk directory of every metric name returned by
 * {@code getSubmetrics(metricName)}.
 *
 * Each directory is resolved as {@code srDir/<name>/<langCode>}; directories
 * that do not exist are skipped. The method deliberately works with metric
 * names only and never loads a metric: as the original author notes, once a
 * metric is loaded it has already accessed its data files.
 *
 * Deletion uses {@code FileUtils.deleteQuietly}, and its result is not checked.
 *
 * @throws ConfigurationException propagated from {@code getSubmetrics}
 */
public void deleteDataDirectories() throws ConfigurationException {
    for (String submetric : getSubmetrics(metricName)) {
        File metricDir = FileUtils.getFile(srDir, submetric, language.getLangCode());
        if (!metricDir.exists()) {
            continue; // nothing on disk for this metric
        }
        LOG.info("deleting metric directory " + metricDir);
        FileUtils.deleteQuietly(metricDir);
    }
}
public void create(String path) throws ConfigurationException, DaoException, WikiBrainException, IOException, InterruptedException { SRBuilder builder = new SRBuilder(env, "prebuiltword2vec", lang); builder.setSkipBuiltMetrics(true); builder.setCreateFakeGoldStandard(true); builder.build();
/**
 * Resolves the type string of the named metric from its configuration.
 *
 * The value of the {@code type} key is returned as-is, except for
 * {@code densevector} and {@code sparsevector}: for those, the value of
 * {@code generator.type} is appended after a dot (for example
 * {@code sparsevector.esa}).
 *
 * @param name metric whose configuration is consulted
 * @return the plain or generator-qualified type string
 * @throws ConfigurationException propagated from {@code getMetricConfig(String)}
 */
public String getMetricType(String name) throws ConfigurationException {
    Config metricConfig = getMetricConfig(name);
    String baseType = metricConfig.getString("type");

    // Only the two vector families carry a generator sub-type.
    if (!baseType.equals("densevector") && !baseType.equals("sparsevector")) {
        return baseType;
    }
    return baseType + "." + metricConfig.getString("generator.type");
}