/** * The main entry point of the code. */ public static void main(String[] args) throws IOException { forceTrack("Processing treebanks"); List<Pair<CoreMap, Collection<Pair<Span, Span>>>> trainingData = new ArrayList<>(); trainingData.addAll(processDirectory("WSJ", new File("/home/gabor/lib/data/penn_treebank/wsj"))); trainingData.addAll(processDirectory("Brown", new File("/home/gabor/lib/data/penn_treebank/brown"))); endTrack("Processing treebanks"); forceTrack("Training"); log("dataset size: " + trainingData.size()); ClauseSplitter.train( trainingData.stream(), new File("/home/gabor/tmp/clauseSearcher.ser.gz"), new File("/home/gabor/tmp/clauseSearcherData.tab.gz")); endTrack("Training"); // Execution.fillOptions(CreateClauseDataset.class, args); // // new CreateClauseDataset().runAndExit(in, System.err, code -> code); } }
/**
 * Classifies every example in parallel and tallies each prediction against the
 * label paired with that example.
 *
 * <p>For each example, {@code classify} is called on the input (the pair's first
 * element) and the predicted label is recorded in an {@link Accuracy} alongside
 * the pair's second element. A one-line summary is logged after every 1000
 * examples. If {@code predictOut} is present, one line of the form
 * {@code label<TAB>confidence} (confidence formatted as {@code 0.0000}) is
 * printed per example, in the stream's encounter order.
 *
 * @param examples   the (input, label) pairs to evaluate; consumed by this call
 * @param predictOut optional sink that receives one prediction line per example
 * @return the accumulated accuracy statistics
 */
default Accuracy computeAccuracy(Stream<Pair<KBPInput, String>> examples, Optional<PrintStream> predictOut) {
  forceTrack("Accuracy");
  Accuracy accuracy = new Accuracy();
  AtomicInteger testI = new AtomicInteger(0);
  // DecimalFormat is not thread-safe and the pipeline below runs in parallel,
  // so every worker thread gets its own formatter instead of sharing one.
  ThreadLocal<DecimalFormat> confidenceFormat =
      ThreadLocal.withInitial(() -> new DecimalFormat("0.0000"));
  forceTrack("Featurizing");
  examples.parallel().map(example -> {
    Pair<String, Double> predicted = this.classify(example.first);
    synchronized (accuracy) {
      accuracy.predict(Collections.singleton(predicted.first), Collections.singleton(example.second));
    }
    // Use the value returned by the increment itself: re-reading the counter
    // could observe increments made by other threads in the meantime.
    int processed = testI.incrementAndGet();
    if (processed % 1000 == 0) {
      // Read the summary under the same lock that guards updates, but log
      // outside the lock so the logger is never called while holding it.
      String summary;
      synchronized (accuracy) {
        summary = accuracy.toOneLineString();
      }
      log(KBPRelationExtractor.class, "[" + processed + "] " + summary);
    }
    return predicted.first + "\t" + confidenceFormat.get().format(predicted.second);
  }).forEachOrdered(line -> predictOut.ifPresent(out -> out.println(line)));
  endTrack("Featurizing");
  log(accuracy.toString());
  endTrack("Accuracy");
  return accuracy;
}
forceTrack("Reading Constituents"); Map<String, Map<KEY, T>> combinedMapping = Generics.newHashMap(); try { forceTrack("Clearing Destination"); if (!destination.cacheDir.exists() && !destination.cacheDir.mkdirs()) { throw new RuntimeException("Could not create cache dir for destination (data is intact): " + destination.cacheDir); forceTrack("Writing New Files"); try { for (Entry<String, Map<KEY, T>> blockEntry : combinedMapping.entrySet()) {
forceTrack("Training inference"); trainingData.forEach(rawExample -> { forceTrack("Training"); Classifier<ClauseClassifierLabel,String> fullClassifier = factory.trainClassifier(dataset); endTrack("Training"); forceTrack("Training accuracy"); dataset.randomize(42L); Util.dumpAccuracy(fullClassifier, dataset); forceTrack(numFolds + " fold cross-validation"); for (int fold = 0; fold < numFolds; ++fold) { forceTrack("Fold " + (fold + 1)); forceTrack("Training"); Pair<GeneralDataset<ClauseClassifierLabel, String>, GeneralDataset<ClauseClassifierLabel, String>> foldData = dataset.splitOutFold(fold, numFolds); Classifier<ClauseClassifierLabel, String> classifier = factory.trainClassifier(foldData.first); endTrack("Training"); forceTrack("Test"); Util.dumpAccuracy(classifier, foldData.second); endTrack("Test");
forceTrack("Processing file " + file.getAbsolutePath() + " ... writing to " + finalOutputFilename); forceTrack("Annotating file " + file.getAbsoluteFile()); try { annotate(annotation);
forceTrack("Test data"); List<Pair<KBPInput, String>> testExamples = KBPRelationExtractor.readDataset(TEST_FILE); log.info("Read " + testExamples.size() + " examples"); forceTrack("Training data"); List<Pair<KBPInput, String>> trainExamples = KBPRelationExtractor.readDataset(TRAIN_FILE); log.info("Read " + trainExamples.size() + " examples"); forceTrack("Creating dataset"); RVFDataset<String, String> dataset = new RVFDataset<>(); final AtomicInteger i = new AtomicInteger(0);
/**
 * Runs {@code classify} over all examples in parallel and scores each predicted
 * label against the label stored as the second element of the example pair.
 *
 * <p>Results are accumulated in a single {@link Accuracy}; updates to it are
 * serialized with a lock. Progress is logged once per 1000 processed examples.
 * When {@code predictOut} holds a stream, each example produces one output line,
 * {@code label<TAB>confidence} with the confidence rendered as {@code 0.0000},
 * emitted in encounter order.
 *
 * @param examples   stream of (input, label) pairs; consumed by this call
 * @param predictOut optional destination for the per-example prediction lines
 * @return the accuracy statistics gathered over all examples
 */
default Accuracy computeAccuracy(Stream<Pair<KBPInput, String>> examples, Optional<PrintStream> predictOut) {
  forceTrack("Accuracy");
  Accuracy accuracy = new Accuracy();
  AtomicInteger testI = new AtomicInteger(0);
  // One formatter per worker thread: DecimalFormat must not be shared across
  // the threads of the parallel stream below without external synchronization.
  ThreadLocal<DecimalFormat> confidenceFormat =
      ThreadLocal.withInitial(() -> new DecimalFormat("0.0000"));
  forceTrack("Featurizing");
  examples.parallel().map(example -> {
    Pair<String, Double> predicted = this.classify(example.first);
    synchronized (accuracy) {
      accuracy.predict(Collections.singleton(predicted.first), Collections.singleton(example.second));
    }
    // Keep the count this thread produced, so the logged number is exactly the
    // multiple of 1000 that triggered the message.
    int count = testI.incrementAndGet();
    if (count % 1000 == 0) {
      // Snapshot the summary while holding the lock that protects updates;
      // the log call itself happens after the lock is released.
      String snapshot;
      synchronized (accuracy) {
        snapshot = accuracy.toOneLineString();
      }
      log(KBPRelationExtractor.class, "[" + count + "] " + snapshot);
    }
    return predicted.first + "\t" + confidenceFormat.get().format(predicted.second);
  }).forEachOrdered(line -> predictOut.ifPresent(sink -> sink.println(line)));
  endTrack("Featurizing");
  log(accuracy.toString());
  endTrack("Accuracy");
  return accuracy;
}
forceTrack("Training inference"); trainingData.forEach(rawExample -> { forceTrack("Training"); Classifier<ClauseClassifierLabel,String> fullClassifier = factory.trainClassifier(dataset); endTrack("Training"); forceTrack("Training accuracy"); dataset.randomize(42L); Util.dumpAccuracy(fullClassifier, dataset); forceTrack(numFolds + " fold cross-validation"); for (int fold = 0; fold < numFolds; ++fold) { forceTrack("Fold " + (fold + 1)); forceTrack("Training"); Pair<GeneralDataset<ClauseClassifierLabel, String>, GeneralDataset<ClauseClassifierLabel, String>> foldData = dataset.splitOutFold(fold, numFolds); Classifier<ClauseClassifierLabel, String> classifier = factory.trainClassifier(foldData.first); endTrack("Training"); forceTrack("Test"); Util.dumpAccuracy(classifier, foldData.second); endTrack("Test");
forceTrack("Featurizing"); RVFDataset<SentimentClass, String> dataset = new RVFDataset<>(); AtomicInteger datasize = new AtomicInteger(0); forceTrack("Training"); if (featureCountThreshold > 1) { dataset.applyFeatureCountThreshold(featureCountThreshold); forceTrack("Evaluating"); factory.setVerbose(false); double sumAccuracy = 0.0;
forceTrack("Processing " + name);
forceTrack("Test data"); List<Pair<KBPInput, String>> testExamples = KBPRelationExtractor.readDataset(TEST_FILE); log.info("Read " + testExamples.size() + " examples"); forceTrack("Training data"); List<Pair<KBPInput, String>> trainExamples = KBPRelationExtractor.readDataset(TRAIN_FILE); log.info("Read " + trainExamples.size() + " examples"); forceTrack("Creating dataset"); RVFDataset<String, String> dataset = new RVFDataset<>(); final AtomicInteger i = new AtomicInteger(0);