/**
 * Builds the instance iterator for a training input.
 *
 * <p>When {@code isList} is true, {@code trainFile} and {@code dataDir} are handed to a
 * {@code FileListIterator}. Otherwise {@code trainFile} is opened with a {@code FileReader}
 * and wrapped in a {@code LineGroupIterator} whose boundary pattern is {@code ^\s*$}
 * and whose {@code skipBoundary} flag is true.
 *
 * @param trainFile the file passed to whichever iterator is chosen
 * @param dataDir   passed to the {@code FileListIterator}; not used when {@code isList} is false
 * @param isList    selects the {@code FileListIterator} branch when true
 * @return the newly constructed iterator
 * @throws IOException if constructing the iterator fails, e.g. the file cannot be opened
 */
private static Iterator<Instance> constructIterator (File trainFile, File dataDir, boolean isList) throws IOException
{
    if (!isList) {
        // The reader is handed to the iterator, which keeps using it after this method returns.
        FileReader trainReader = new FileReader (trainFile);
        Pattern boundary = Pattern.compile ("^\\s*$");
        return new LineGroupIterator (trainReader, boundary, true);
    }
    return new FileListIterator (trainFile, dataDir, null, null, true);
}
/**
 * Creates an iterator over line groups read from {@code input}.
 *
 * <p>The reader is wrapped in a {@link LineNumberReader}, the boundary pattern and the
 * skip flag are stored, and {@code setNextLineGroup()} is then called once
 * (presumably to read ahead the first group; its body is not visible here).
 *
 * @param input             source of the lines
 * @param lineBoundaryRegex pattern stored as this iterator's line-boundary regex
 * @param skipBoundary      stored as this iterator's skip-boundary flag
 */
public LineGroupIterator (Reader input, Pattern lineBoundaryRegex, boolean skipBoundary)
{
    this.skipBoundary = skipBoundary;
    this.lineBoundaryRegex = lineBoundaryRegex;
    this.reader = new LineNumberReader (input);
    // Kept last so it runs with all three fields above already assigned.
    setNextLineGroup ();
}
/**
 * Builds the instance iterator for a training input.
 *
 * <p>When {@code isList} is true, {@code trainFile} and {@code dataDir} are handed to a
 * {@code FileListIterator}. Otherwise {@code trainFile} is opened with a {@code FileReader}
 * and wrapped in a {@code LineGroupIterator} whose boundary pattern is {@code ^\s*$}
 * and whose {@code skipBoundary} flag is true.
 *
 * @param trainFile the file passed to whichever iterator is chosen
 * @param dataDir   passed to the {@code FileListIterator}; not used when {@code isList} is false
 * @param isList    selects the {@code FileListIterator} branch when true
 * @return the newly constructed iterator
 * @throws IOException if constructing the iterator fails, e.g. the file cannot be opened
 */
private static Iterator<Instance> constructIterator (File trainFile, File dataDir, boolean isList) throws IOException
{
    if (!isList) {
        // The reader is handed to the iterator, which keeps using it after this method returns.
        FileReader trainReader = new FileReader (trainFile);
        Pattern boundary = Pattern.compile ("^\\s*$");
        return new LineGroupIterator (trainReader, boundary, true);
    }
    return new FileListIterator (trainFile, dataDir, null, null, true);
}
/**
 * Creates an iterator over line groups read from {@code input}.
 *
 * <p>The reader is wrapped in a {@link LineNumberReader}, the boundary pattern and the
 * skip flag are stored, and {@code setNextLineGroup()} is then called once
 * (presumably to read ahead the first group; its body is not visible here).
 *
 * @param input             source of the lines
 * @param lineBoundaryRegex pattern stored as this iterator's line-boundary regex
 * @param skipBoundary      stored as this iterator's skip-boundary flag
 */
public LineGroupIterator (Reader input, Pattern lineBoundaryRegex, boolean skipBoundary)
{
    this.skipBoundary = skipBoundary;
    this.lineBoundaryRegex = lineBoundaryRegex;
    this.reader = new LineNumberReader (input);
    // Kept last so it runs with all three fields above already assigned.
    setNextLineGroup ();
}
/**
 * Builds the instance iterator for a training input.
 *
 * <p>When {@code isList} is true, {@code trainFile} and {@code dataDir} are handed to a
 * {@code FileListIterator}. Otherwise {@code trainFile} is opened with a {@code FileReader}
 * and wrapped in a {@code LineGroupIterator} whose boundary pattern is {@code ^\s*$}
 * and whose {@code skipBoundary} flag is true.
 *
 * @param trainFile the file passed to whichever iterator is chosen
 * @param dataDir   passed to the {@code FileListIterator}; not used when {@code isList} is false
 * @param isList    selects the {@code FileListIterator} branch when true
 * @return the newly constructed iterator
 * @throws IOException if constructing the iterator fails, e.g. the file cannot be opened
 */
private static Iterator<Instance> constructIterator (File trainFile, File dataDir, boolean isList) throws IOException
{
    if (!isList) {
        // The reader is handed to the iterator, which keeps using it after this method returns.
        FileReader trainReader = new FileReader (trainFile);
        Pattern boundary = Pattern.compile ("^\\s*$");
        return new LineGroupIterator (trainReader, boundary, true);
    }
    return new FileListIterator (trainFile, dataDir, null, null, true);
}
/**
 * Creates an iterator over line groups read from {@code input}.
 *
 * <p>The reader is wrapped in a {@link LineNumberReader}, the boundary pattern and the
 * skip flag are stored, and {@code setNextLineGroup()} is then called once
 * (presumably to read ahead the first group; its body is not visible here).
 *
 * @param input             source of the lines
 * @param lineBoundaryRegex pattern stored as this iterator's line-boundary regex
 * @param skipBoundary      stored as this iterator's skip-boundary flag
 */
public LineGroupIterator (Reader input, Pattern lineBoundaryRegex, boolean skipBoundary)
{
    this.skipBoundary = skipBoundary;
    this.lineBoundaryRegex = lineBoundaryRegex;
    this.reader = new LineNumberReader (input);
    // Kept last so it runs with all three fields above already assigned.
    setNextLineGroup ();
}
/**
 * Trains an HMM on one gzip-compressed file and evaluates it on the training data and
 * on a second gzip-compressed file.
 *
 * <p>Both files are pushed through the same two-stage pipe
 * ({@code SimpleTaggerSentence2TokenSequence} followed by
 * {@code TokenSequence2FeatureSequence}). The HMM's states are added with
 * {@code addStatesForLabelsConnectedAsIn} on the training instances, the model is trained
 * with {@code HMMTrainerByLikelihood} (second argument {@code 10}, presumably the
 * iteration count), and a {@code PerClassAccuracyEvaluator} is run for each data set.
 *
 * @param trainingFilename path of the gzip-compressed training file
 * @param testingFilename  path of the gzip-compressed testing file
 * @throws IOException if either file cannot be opened or read
 */
public TrainHMM(String trainingFilename, String testingFilename) throws IOException {
    ArrayList<Pipe> pipes = new ArrayList<Pipe>();
    pipes.add(new SimpleTaggerSentence2TokenSequence());
    pipes.add(new TokenSequence2FeatureSequence());
    Pipe pipe = new SerialPipes(pipes);

    InstanceList trainingInstances = new InstanceList(pipe);
    InstanceList testingInstances = new InstanceList(pipe);

    // Each file is opened, consumed and closed before the next one is touched.
    addGzippedLineGroups(trainingInstances, trainingFilename);
    addGzippedLineGroups(testingInstances, testingFilename);

    HMM hmm = new HMM(pipe, null);
    hmm.addStatesForLabelsConnectedAsIn(trainingInstances);
    //hmm.addStatesForBiLabelsConnectedAsIn(trainingInstances);

    HMMTrainerByLikelihood trainer = new HMMTrainerByLikelihood(hmm);
    TransducerEvaluator trainingEvaluator = new PerClassAccuracyEvaluator(trainingInstances, "training");
    TransducerEvaluator testingEvaluator = new PerClassAccuracyEvaluator(testingInstances, "testing");

    trainer.train(trainingInstances, 10);
    trainingEvaluator.evaluate(trainer);
    testingEvaluator.evaluate(trainer);
}

/**
 * Adds the line groups of a gzip-compressed file to {@code instances} and closes the file.
 *
 * <p>Lines matching {@code ^\s*$} are used as the group boundary, with the iterator's
 * {@code skipBoundary} flag set to true.
 *
 * @param instances the list that receives the piped instances
 * @param filename  path of the gzip-compressed file to read
 * @throws IOException if the file cannot be opened, decompressed or closed
 */
private static void addGzippedLineGroups(InstanceList instances, String filename) throws IOException {
    FileInputStream fileStream = new FileInputStream(filename);
    try {
        BufferedReader reader = new BufferedReader(new InputStreamReader(new GZIPInputStream(fileStream)));
        // NOTE(review): assumes addThruPipe drains the iterator before returning,
        // so nothing reads from the stream after it is closed below.
        instances.addThruPipe(new LineGroupIterator(reader, Pattern.compile("^\\s*$"), true));
    } finally {
        // Closing the underlying file stream releases the descriptor even when
        // the GZIP header could not be read and no wrapper was ever created.
        fileStream.close();
    }
}
/**
 * Returns the current line group wrapped in an {@code Instance} and advances to the next group.
 *
 * <p>The instance is built from the pending line group, a {@code null} second argument,
 * the name {@code "linegroup"} followed by the running group index, and the pending
 * boundary when {@code putBoundaryInSource} is set ({@code null} otherwise).
 *
 * @return the instance carrying the current line group
 * @throws java.util.NoSuchElementException if there is no pending line group
 */
public Instance next ()
{
    // An assert is skipped unless the JVM runs with -ea, so the old check let an
    // exhausted iterator hand back an Instance built from a null line group.
    if (nextLineGroup == null) {
        throw new java.util.NoSuchElementException ("no more line groups");
    }
    Instance carrier = new Instance (nextLineGroup, null, "linegroup"+groupIndex++, putBoundaryInSource ? nextBoundary : null);
    setNextLineGroup ();
    return carrier;
}
/**
 * Trains an HMM on one gzip-compressed file and evaluates it on the training data and
 * on a second gzip-compressed file.
 *
 * <p>Both files are pushed through the same two-stage pipe
 * ({@code SimpleTaggerSentence2TokenSequence} followed by
 * {@code TokenSequence2FeatureSequence}). The HMM's states are added with
 * {@code addStatesForLabelsConnectedAsIn} on the training instances, the model is trained
 * with {@code HMMTrainerByLikelihood} (second argument {@code 10}, presumably the
 * iteration count), and a {@code PerClassAccuracyEvaluator} is run for each data set.
 *
 * @param trainingFilename path of the gzip-compressed training file
 * @param testingFilename  path of the gzip-compressed testing file
 * @throws IOException if either file cannot be opened or read
 */
public TrainHMM(String trainingFilename, String testingFilename) throws IOException {
    ArrayList<Pipe> pipes = new ArrayList<Pipe>();
    pipes.add(new SimpleTaggerSentence2TokenSequence());
    pipes.add(new TokenSequence2FeatureSequence());
    Pipe pipe = new SerialPipes(pipes);

    InstanceList trainingInstances = new InstanceList(pipe);
    InstanceList testingInstances = new InstanceList(pipe);

    // Each file is opened, consumed and closed before the next one is touched.
    addGzippedLineGroups(trainingInstances, trainingFilename);
    addGzippedLineGroups(testingInstances, testingFilename);

    HMM hmm = new HMM(pipe, null);
    hmm.addStatesForLabelsConnectedAsIn(trainingInstances);
    //hmm.addStatesForBiLabelsConnectedAsIn(trainingInstances);

    HMMTrainerByLikelihood trainer = new HMMTrainerByLikelihood(hmm);
    TransducerEvaluator trainingEvaluator = new PerClassAccuracyEvaluator(trainingInstances, "training");
    TransducerEvaluator testingEvaluator = new PerClassAccuracyEvaluator(testingInstances, "testing");

    trainer.train(trainingInstances, 10);
    trainingEvaluator.evaluate(trainer);
    testingEvaluator.evaluate(trainer);
}

/**
 * Adds the line groups of a gzip-compressed file to {@code instances} and closes the file.
 *
 * <p>Lines matching {@code ^\s*$} are used as the group boundary, with the iterator's
 * {@code skipBoundary} flag set to true.
 *
 * @param instances the list that receives the piped instances
 * @param filename  path of the gzip-compressed file to read
 * @throws IOException if the file cannot be opened, decompressed or closed
 */
private static void addGzippedLineGroups(InstanceList instances, String filename) throws IOException {
    FileInputStream fileStream = new FileInputStream(filename);
    try {
        BufferedReader reader = new BufferedReader(new InputStreamReader(new GZIPInputStream(fileStream)));
        // NOTE(review): assumes addThruPipe drains the iterator before returning,
        // so nothing reads from the stream after it is closed below.
        instances.addThruPipe(new LineGroupIterator(reader, Pattern.compile("^\\s*$"), true));
    } finally {
        // Closing the underlying file stream releases the descriptor even when
        // the GZIP header could not be read and no wrapper was ever created.
        fileStream.close();
    }
}
/**
 * Returns the current line group wrapped in an {@code Instance} and advances to the next group.
 *
 * <p>The instance is built from the pending line group, a {@code null} second argument,
 * the name {@code "linegroup"} followed by the running group index, and the pending
 * boundary when {@code putBoundaryInSource} is set ({@code null} otherwise).
 *
 * @return the instance carrying the current line group
 * @throws java.util.NoSuchElementException if there is no pending line group
 */
public Instance next ()
{
    // An assert is skipped unless the JVM runs with -ea, so the old check let an
    // exhausted iterator hand back an Instance built from a null line group.
    if (nextLineGroup == null) {
        throw new java.util.NoSuchElementException ("no more line groups");
    }
    Instance carrier = new Instance (nextLineGroup, null, "linegroup"+groupIndex++, putBoundaryInSource ? nextBoundary : null);
    setNextLineGroup ();
    return carrier;
}
/**
 * Trains an HMM on one gzip-compressed file and evaluates it on the training data and
 * on a second gzip-compressed file.
 *
 * <p>Both files are pushed through the same two-stage pipe
 * ({@code SimpleTaggerSentence2TokenSequence} followed by
 * {@code TokenSequence2FeatureSequence}). The HMM's states are added with
 * {@code addStatesForLabelsConnectedAsIn} on the training instances, the model is trained
 * with {@code HMMTrainerByLikelihood} (second argument {@code 10}, presumably the
 * iteration count), and a {@code PerClassAccuracyEvaluator} is run for each data set.
 *
 * @param trainingFilename path of the gzip-compressed training file
 * @param testingFilename  path of the gzip-compressed testing file
 * @throws IOException if either file cannot be opened or read
 */
public TrainHMM(String trainingFilename, String testingFilename) throws IOException {
    ArrayList<Pipe> pipes = new ArrayList<Pipe>();
    pipes.add(new SimpleTaggerSentence2TokenSequence());
    pipes.add(new TokenSequence2FeatureSequence());
    Pipe pipe = new SerialPipes(pipes);

    InstanceList trainingInstances = new InstanceList(pipe);
    InstanceList testingInstances = new InstanceList(pipe);

    // Each file is opened, consumed and closed before the next one is touched.
    addGzippedLineGroups(trainingInstances, trainingFilename);
    addGzippedLineGroups(testingInstances, testingFilename);

    HMM hmm = new HMM(pipe, null);
    hmm.addStatesForLabelsConnectedAsIn(trainingInstances);
    //hmm.addStatesForBiLabelsConnectedAsIn(trainingInstances);

    HMMTrainerByLikelihood trainer = new HMMTrainerByLikelihood(hmm);
    TransducerEvaluator trainingEvaluator = new PerClassAccuracyEvaluator(trainingInstances, "training");
    TransducerEvaluator testingEvaluator = new PerClassAccuracyEvaluator(testingInstances, "testing");

    trainer.train(trainingInstances, 10);
    trainingEvaluator.evaluate(trainer);
    testingEvaluator.evaluate(trainer);
}

/**
 * Adds the line groups of a gzip-compressed file to {@code instances} and closes the file.
 *
 * <p>Lines matching {@code ^\s*$} are used as the group boundary, with the iterator's
 * {@code skipBoundary} flag set to true.
 *
 * @param instances the list that receives the piped instances
 * @param filename  path of the gzip-compressed file to read
 * @throws IOException if the file cannot be opened, decompressed or closed
 */
private static void addGzippedLineGroups(InstanceList instances, String filename) throws IOException {
    FileInputStream fileStream = new FileInputStream(filename);
    try {
        BufferedReader reader = new BufferedReader(new InputStreamReader(new GZIPInputStream(fileStream)));
        // NOTE(review): assumes addThruPipe drains the iterator before returning,
        // so nothing reads from the stream after it is closed below.
        instances.addThruPipe(new LineGroupIterator(reader, Pattern.compile("^\\s*$"), true));
    } finally {
        // Closing the underlying file stream releases the descriptor even when
        // the GZIP header could not be read and no wrapper was ever created.
        fileStream.close();
    }
}
/**
 * Returns the current line group wrapped in an {@code Instance} and advances to the next group.
 *
 * <p>The instance is built from the pending line group, a {@code null} second argument,
 * the name {@code "linegroup"} followed by the running group index, and the pending
 * boundary when {@code putBoundaryInSource} is set ({@code null} otherwise).
 *
 * @return the instance carrying the current line group
 * @throws java.util.NoSuchElementException if there is no pending line group
 */
public Instance next ()
{
    // An assert is skipped unless the JVM runs with -ea, so the old check let an
    // exhausted iterator hand back an Instance built from a null line group.
    if (nextLineGroup == null) {
        throw new java.util.NoSuchElementException ("no more line groups");
    }
    Instance carrier = new Instance (nextLineGroup, null, "linegroup"+groupIndex++, putBoundaryInSource ? nextBoundary : null);
    setNextLineGroup ();
    return carrier;
}
/**
 * Runs a gzip-compressed training file through a CRF feature pipe and prints the
 * resulting sequences to the file {@code test.out}.
 *
 * <p>The pipe adds offset conjunctions, character-suffix features of length 1 to 3,
 * four regex-match features, a first-position feature, converts to a feature vector
 * sequence, and finally prints through a {@code SequencePrintingPipe}.
 *
 * @param trainingFilename path of the gzip-compressed input file
 * @throws IOException if the output file or the input file cannot be opened or read
 */
public TestCRFPipe(String trainingFilename) throws IOException {
    ArrayList<Pipe> pipes = new ArrayList<Pipe>();

    PrintWriter out = new PrintWriter("test.out");
    try {
        int[][] conjunctions = new int[3][];
        conjunctions[0] = new int[] { -1 };
        conjunctions[1] = new int[] { 1 };
        conjunctions[2] = new int[] { -2, -1 };

        pipes.add(new SimpleTaggerSentence2TokenSequence());
        //pipes.add(new FeaturesInWindow("PREV-", -1, 1));
        //pipes.add(new FeaturesInWindow("NEXT-", 1, 2));
        pipes.add(new OffsetConjunctions(conjunctions));
        pipes.add(new TokenTextCharSuffix("C1=", 1));
        pipes.add(new TokenTextCharSuffix("C2=", 2));
        pipes.add(new TokenTextCharSuffix("C3=", 3));
        pipes.add(new RegexMatches("CAPITALIZED", Pattern.compile("^\\p{Lu}.*")));
        pipes.add(new RegexMatches("STARTSNUMBER", Pattern.compile("^[0-9].*")));
        pipes.add(new RegexMatches("HYPHENATED", Pattern.compile(".*\\-.*")));
        pipes.add(new RegexMatches("DOLLARSIGN", Pattern.compile("\\$.*")));
        pipes.add(new TokenFirstPosition("FIRSTTOKEN"));
        pipes.add(new TokenSequence2FeatureVectorSequence());
        pipes.add(new SequencePrintingPipe(out));

        Pipe pipe = new SerialPipes(pipes);
        InstanceList trainingInstances = new InstanceList(pipe);

        FileInputStream trainingStream = new FileInputStream(trainingFilename);
        try {
            trainingInstances.addThruPipe(new LineGroupIterator(new BufferedReader(new InputStreamReader(new GZIPInputStream(trainingStream))), Pattern.compile("^\\s*$"), true));
        } finally {
            // Release the file descriptor whether or not reading succeeded.
            trainingStream.close();
        }
    } finally {
        // Previously skipped on any exception, leaving test.out open and unflushed.
        out.close();
    }
}
/**
 * Runs a gzip-compressed training file through a CRF feature pipe and prints the
 * resulting sequences to the file {@code test.out}.
 *
 * <p>The pipe adds offset conjunctions, character-suffix features of length 1 to 3,
 * four regex-match features, a first-position feature, converts to a feature vector
 * sequence, and finally prints through a {@code SequencePrintingPipe}.
 *
 * @param trainingFilename path of the gzip-compressed input file
 * @throws IOException if the output file or the input file cannot be opened or read
 */
public TestCRFPipe(String trainingFilename) throws IOException {
    ArrayList<Pipe> pipes = new ArrayList<Pipe>();

    PrintWriter out = new PrintWriter("test.out");
    try {
        int[][] conjunctions = new int[3][];
        conjunctions[0] = new int[] { -1 };
        conjunctions[1] = new int[] { 1 };
        conjunctions[2] = new int[] { -2, -1 };

        pipes.add(new SimpleTaggerSentence2TokenSequence());
        //pipes.add(new FeaturesInWindow("PREV-", -1, 1));
        //pipes.add(new FeaturesInWindow("NEXT-", 1, 2));
        pipes.add(new OffsetConjunctions(conjunctions));
        pipes.add(new TokenTextCharSuffix("C1=", 1));
        pipes.add(new TokenTextCharSuffix("C2=", 2));
        pipes.add(new TokenTextCharSuffix("C3=", 3));
        pipes.add(new RegexMatches("CAPITALIZED", Pattern.compile("^\\p{Lu}.*")));
        pipes.add(new RegexMatches("STARTSNUMBER", Pattern.compile("^[0-9].*")));
        pipes.add(new RegexMatches("HYPHENATED", Pattern.compile(".*\\-.*")));
        pipes.add(new RegexMatches("DOLLARSIGN", Pattern.compile("\\$.*")));
        pipes.add(new TokenFirstPosition("FIRSTTOKEN"));
        pipes.add(new TokenSequence2FeatureVectorSequence());
        pipes.add(new SequencePrintingPipe(out));

        Pipe pipe = new SerialPipes(pipes);
        InstanceList trainingInstances = new InstanceList(pipe);

        FileInputStream trainingStream = new FileInputStream(trainingFilename);
        try {
            trainingInstances.addThruPipe(new LineGroupIterator(new BufferedReader(new InputStreamReader(new GZIPInputStream(trainingStream))), Pattern.compile("^\\s*$"), true));
        } finally {
            // Release the file descriptor whether or not reading succeeded.
            trainingStream.close();
        }
    } finally {
        // Previously skipped on any exception, leaving test.out open and unflushed.
        out.close();
    }
}
/**
 * Runs a gzip-compressed training file through a CRF feature pipe and prints the
 * resulting sequences to the file {@code test.out}.
 *
 * <p>The pipe adds offset conjunctions, character-suffix features of length 1 to 3,
 * four regex-match features, a first-position feature, converts to a feature vector
 * sequence, and finally prints through a {@code SequencePrintingPipe}.
 *
 * @param trainingFilename path of the gzip-compressed input file
 * @throws IOException if the output file or the input file cannot be opened or read
 */
public TestCRFPipe(String trainingFilename) throws IOException {
    ArrayList<Pipe> pipes = new ArrayList<Pipe>();

    PrintWriter out = new PrintWriter("test.out");
    try {
        int[][] conjunctions = new int[3][];
        conjunctions[0] = new int[] { -1 };
        conjunctions[1] = new int[] { 1 };
        conjunctions[2] = new int[] { -2, -1 };

        pipes.add(new SimpleTaggerSentence2TokenSequence());
        //pipes.add(new FeaturesInWindow("PREV-", -1, 1));
        //pipes.add(new FeaturesInWindow("NEXT-", 1, 2));
        pipes.add(new OffsetConjunctions(conjunctions));
        pipes.add(new TokenTextCharSuffix("C1=", 1));
        pipes.add(new TokenTextCharSuffix("C2=", 2));
        pipes.add(new TokenTextCharSuffix("C3=", 3));
        pipes.add(new RegexMatches("CAPITALIZED", Pattern.compile("^\\p{Lu}.*")));
        pipes.add(new RegexMatches("STARTSNUMBER", Pattern.compile("^[0-9].*")));
        pipes.add(new RegexMatches("HYPHENATED", Pattern.compile(".*\\-.*")));
        pipes.add(new RegexMatches("DOLLARSIGN", Pattern.compile("\\$.*")));
        pipes.add(new TokenFirstPosition("FIRSTTOKEN"));
        pipes.add(new TokenSequence2FeatureVectorSequence());
        pipes.add(new SequencePrintingPipe(out));

        Pipe pipe = new SerialPipes(pipes);
        InstanceList trainingInstances = new InstanceList(pipe);

        FileInputStream trainingStream = new FileInputStream(trainingFilename);
        try {
            trainingInstances.addThruPipe(new LineGroupIterator(new BufferedReader(new InputStreamReader(new GZIPInputStream(trainingStream))), Pattern.compile("^\\s*$"), true));
        } finally {
            // Release the file descriptor whether or not reading succeeded.
            trainingStream.close();
        }
    } finally {
        // Previously skipped on any exception, leaving test.out open and unflushed.
        out.close();
    }
}
// Excerpt from a larger method body: trainingInstances, pipe, trainingFilename and
// testingFilename are declared outside this fragment.
// Creates the testing InstanceList on the same pipe, then feeds each list from a
// gzip-compressed file split into line groups at lines matching ^\s*$ (the trailing
// `true` is LineGroupIterator's skipBoundary argument).
// NOTE(review): neither input stream is closed in this fragment.
InstanceList testingInstances = new InstanceList(pipe); trainingInstances.addThruPipe(new LineGroupIterator(new BufferedReader(new InputStreamReader(new GZIPInputStream(new FileInputStream(trainingFilename)))), Pattern.compile("^\\s*$"), true)); testingInstances.addThruPipe(new LineGroupIterator(new BufferedReader(new InputStreamReader(new GZIPInputStream(new FileInputStream(testingFilename)))), Pattern.compile("^\\s*$"), true));
// Excerpt from a larger method body: trainingInstances, pipe, trainingFilename and
// testingFilename are declared outside this fragment.
// Creates the testing InstanceList on the same pipe, then feeds each list from a
// gzip-compressed file split into line groups at lines matching ^\s*$ (the trailing
// `true` is LineGroupIterator's skipBoundary argument).
// NOTE(review): neither input stream is closed in this fragment.
InstanceList testingInstances = new InstanceList(pipe); trainingInstances.addThruPipe(new LineGroupIterator(new BufferedReader(new InputStreamReader(new GZIPInputStream(new FileInputStream(trainingFilename)))), Pattern.compile("^\\s*$"), true)); testingInstances.addThruPipe(new LineGroupIterator(new BufferedReader(new InputStreamReader(new GZIPInputStream(new FileInputStream(testingFilename)))), Pattern.compile("^\\s*$"), true));
// Excerpt from a larger method body: trainingInstances, pipe, trainingFilename and
// testingFilename are declared outside this fragment.
// Creates the testing InstanceList on the same pipe, then feeds each list from a
// gzip-compressed file split into line groups at lines matching ^\s*$ (the trailing
// `true` is LineGroupIterator's skipBoundary argument).
// NOTE(review): neither input stream is closed in this fragment.
InstanceList testingInstances = new InstanceList(pipe); trainingInstances.addThruPipe(new LineGroupIterator(new BufferedReader(new InputStreamReader(new GZIPInputStream(new FileInputStream(trainingFilename)))), Pattern.compile("^\\s*$"), true)); testingInstances.addThruPipe(new LineGroupIterator(new BufferedReader(new InputStreamReader(new GZIPInputStream(new FileInputStream(testingFilename)))), Pattern.compile("^\\s*$"), true));
/**
 * Pipes {@code sampleFixedData} through a {@code GenericAcrfData2TokenSequence}
 * constructed with the argument {@code 2} and checks the result: exactly one
 * instance is produced, and its target is a {@code LabelsSequence} of size 4.
 *
 * <p>Line groups are split at lines matching {@code ^$}, with the iterator's
 * {@code skipBoundary} flag set to true.
 */
public void testFixedNumLabels () throws IOException, ClassNotFoundException
{
    Pipe tokenPipe = new GenericAcrfData2TokenSequence (2);
    InstanceList instances = new InstanceList (tokenPipe);

    StringReader data = new StringReader (sampleFixedData);
    Pattern emptyLine = Pattern.compile ("^$");
    instances.addThruPipe (new LineGroupIterator (data, emptyLine, true));

    assertEquals (1, instances.size ());

    LabelsSequence labels = (LabelsSequence) instances.get (0).getTarget ();
    assertEquals (4, labels.size ());
}
/**
 * Pipes {@code sampleFixedData} through a {@code GenericAcrfData2TokenSequence}
 * constructed with the argument {@code 2} and checks the result: exactly one
 * instance is produced, and its target is a {@code LabelsSequence} of size 4.
 *
 * <p>Line groups are split at lines matching {@code ^$}, with the iterator's
 * {@code skipBoundary} flag set to true.
 */
public void testFixedNumLabels () throws IOException, ClassNotFoundException
{
    Pipe tokenPipe = new GenericAcrfData2TokenSequence (2);
    InstanceList instances = new InstanceList (tokenPipe);

    StringReader data = new StringReader (sampleFixedData);
    Pattern emptyLine = Pattern.compile ("^$");
    instances.addThruPipe (new LineGroupIterator (data, emptyLine, true));

    assertEquals (1, instances.size ());

    LabelsSequence labels = (LabelsSequence) instances.get (0).getTarget ();
    assertEquals (4, labels.size ());
}