// TODO: not adapted for focus annotations
@Override
public List<Feature> extract(JCas jcas)
{
    List<Feature> featList = new ArrayList<Feature>();
    List<String> tokens = JCasUtil.toText(JCasUtil.select(jcas, Token.class));
    int nrOfTokens = tokens.size();

    // Matches tokens that consist only of letters and digits and contain at
    // least one digit, e.g. "mp3" or "2nd"
    Pattern p = Pattern.compile("^[a-zA-Z0-9]*[0-9]+[a-zA-Z0-9]*$");
    int pmatches = 0;
    for (String t : tokens) {
        Matcher m = p.matcher(t);
        if (m.find()) {
            pmatches++;
            System.out.println(t + " matches Words With Numbers");
        }
    }
    featList.add(new Feature(FEATURE_NAME, (double) pmatches / nrOfTokens));
    return featList;
}
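// A quick self-contained check of what the pattern above accepts: only tokens
// made entirely of letters and digits that contain at least one digit. The
// sample strings are illustrative, not taken from the original code.
Pattern p = Pattern.compile("^[a-zA-Z0-9]*[0-9]+[a-zA-Z0-9]*$");
System.out.println(p.matcher("mp3").find());    // true
System.out.println(p.matcher("2nd").find());    // true
System.out.println(p.matcher("hello").find());  // false: no digit
System.out.println(p.matcher("3.5").find());    // false: "." is not in the character class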
// TODO: not adapted for focus annotations
@Override
public List<Feature> extract(JCas jcas)
    throws TextClassificationException
{
    if (topicFilePath == null || topicFilePath.isEmpty()) {
        // Fail fast; continuing with a null path would only cause an NPE below
        throw new TextClassificationException("Path to word list must be set!");
    }

    List<Feature> featList = new ArrayList<Feature>();
    List<String> tokens = JCasUtil.toText(JCasUtil.select(jcas, Token.class));
    try {
        List<String> topics = FileUtils.readLines(new File(topicFilePath));
        for (String t : topics) {
            featList.addAll(countWordHits(t, tokens));
        }
    }
    catch (IOException e) {
        // Surface the problem instead of printing the stack trace and
        // silently returning an incomplete feature list
        throw new TextClassificationException(e);
    }
    return featList;
}
public static void assertToken(String[] aExpected, Collection<Token> aActual)
{
    if (aExpected == null) {
        return;
    }

    List<String> expected = asList(aExpected);
    List<String> actual = toText(aActual);

    System.out.printf("%-20s - Expected: %s%n", "Tokens", asCopyableString(expected));
    System.out.printf("%-20s - Actual  : %s%n", "Tokens", asCopyableString(actual));

    assertEquals(asCopyableString(expected, true), asCopyableString(actual, true));
}
public static void assertCoreference(String[][] aExpected, Collection<CoreferenceChain> aActual)
{
    List<CoreferenceChain> actual = new ArrayList<CoreferenceChain>(aActual);

    for (String[] i : aExpected) {
        System.out.printf("%-20s - Expected: %s%n", "Coreference", asCopyableString(asList(i)));
    }
    for (CoreferenceChain i : actual) {
        System.out.printf("%-20s - Actual  : %s%n", "Coreference",
                asCopyableString(toText(i.links())));
    }

    if (aExpected.length == aActual.size()) {
        for (int i = 0; i < actual.size(); i++) {
            assertEquals(asCopyableString(asList(aExpected[i]), true),
                    asCopyableString(toText(actual.get(i).links()), true));
        }
    }
    else {
        fail("Expected [" + aExpected.length + "] chains but found [" + aActual.size() + "]");
    }
}
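// Hypothetical usage of assertCoreference: each inner array is one expected
// chain, in the same order as the chains occur in the CAS. The mention
// strings here are invented for illustration.
String[][] expectedChains = new String[][] {
        new String[] { "John", "he", "his" },
        new String[] { "the car", "it" } };
assertCoreference(expectedChains, JCasUtil.select(jcas, CoreferenceChain.class));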
public static void assertSentence(String[] aExpected, Collection<Sentence> aActual)
{
    if (aExpected == null) {
        return;
    }

    List<String> expected = asList(aExpected);
    List<String> actual = toText(aActual);

    System.out.printf("%-20s - Expected: %s%n", "Sentences", asCopyableString(expected));
    System.out.printf("%-20s - Actual  : %s%n", "Sentences", asCopyableString(actual));

    assertEquals(asCopyableString(expected, true), asCopyableString(actual, true));
}
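// Hypothetical test usage of the assert helpers above, assuming `jcas` holds
// a document that has already been run through a segmenter:
String[] expectedSentences = new String[] { "This is a test ." };
String[] expectedTokens = new String[] { "This", "is", "a", "test", "." };
assertSentence(expectedSentences, JCasUtil.select(jcas, Sentence.class));
assertToken(expectedTokens, JCasUtil.select(jcas, Token.class));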
@Override
public Set<Feature> extract(JCas jcas, TextClassificationTarget aTarget)
    throws TextClassificationException
{
    if (topicFilePath == null || topicFilePath.isEmpty()) {
        throw new TextClassificationException("Path to word list must be set!");
    }

    Set<Feature> features = new HashSet<Feature>();
    List<String> tokens = JCasUtil.toText(JCasUtil.selectCovered(jcas, Token.class, aTarget));
    try {
        List<String> topics = FileUtils.readLines(new File(topicFilePath), "utf-8");
        for (String t : topics) {
            features.addAll(countWordHits(t, tokens));
        }
    }
    catch (IOException e) {
        throw new TextClassificationException(e);
    }
    return features;
}
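// countWordHits is referenced above but not shown. A minimal sketch of what it
// might look like, assuming it emits one relative-frequency feature per word;
// the real implementation may well differ.
private Set<Feature> countWordHits(String word, List<String> tokens)
{
    int hits = 0;
    for (String t : tokens) {
        if (t.equalsIgnoreCase(word)) {
            hits++;
        }
    }
    double ratio = tokens.isEmpty() ? 0.0 : (double) hits / tokens.size();
    return Collections.singleton(new Feature("wordHits_" + word, ratio));
}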
public static FrequencyDistribution<String> getDocumentSkipNgrams(JCas jcas,
        boolean lowerCaseNGrams, boolean filterPartialMatches, int minN, int maxN, int skipN,
        Set<String> stopwords)
{
    FrequencyDistribution<String> documentNgrams = new FrequencyDistribution<String>();
    for (Sentence s : select(jcas, Sentence.class)) {
        for (List<String> ngram : new SkipNgramStringListIterable(
                toText(selectCovered(Token.class, s)), minN, maxN, skipN)) {
            if (passesNgramFilter(ngram, stopwords, filterPartialMatches)) {
                String ngramString = StringUtils.join(ngram, NGRAM_GLUE);
                if (lowerCaseNGrams) {
                    ngramString = ngramString.toLowerCase();
                }
                documentNgrams.inc(ngramString);
            }
        }
    }
    return documentNgrams;
}
public static FrequencyDistribution<String> getDocumentNgrams(JCas jcas,
        boolean lowerCaseNGrams, boolean filterPartialMatches, int minN, int maxN,
        Set<String> stopwords)
{
    FrequencyDistribution<String> documentNgrams = new FrequencyDistribution<String>();
    // TODO parameterize type
    for (Sentence s : select(jcas, Sentence.class)) {
        for (List<String> ngram : new NGramStringListIterable(
                toText(selectCovered(Token.class, s)), minN, maxN)) {
            if (passesNgramFilter(ngram, stopwords, filterPartialMatches)) {
                String ngramString = StringUtils.join(ngram, NGRAM_GLUE);
                if (lowerCaseNGrams) {
                    ngramString = ngramString.toLowerCase();
                }
                documentNgrams.inc(ngramString);
            }
        }
    }
    return documentNgrams;
}
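// Hypothetical call site for getDocumentNgrams above: count all lower-cased
// uni-, bi- and trigrams in the document with an empty stopword list.
FrequencyDistribution<String> fd = getDocumentNgrams(jcas, true, false, 1, 3,
        new HashSet<String>());
for (String ngram : fd.getKeys()) {
    System.out.println(ngram + "\t" + fd.getCount(ngram));
}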
@Override
public void process(JCas jCas) throws AnalysisEngineProcessException
{
    for (Annotation window : JCasUtil.select(jCas, this.windowClass)) {
        List<TOKEN_TYPE> tokens = this.tokenOps.selectTokens(jCas, window);
        if (tokens.isEmpty()) {
            // Skip empty windows instead of aborting the whole document
            continue;
        }
        List<String> tokenStrings = JCasUtil.toText(tokens);

        // As of version 1.3.0, ClearNLP does all processing through its own
        // dependency tree structure
        DEPTree clearNlpDepTree = new DEPTree(tokenStrings);
        this.tagger.process(clearNlpDepTree);

        // ClearNLP uses index 0 for the artificial root node, so the POS tag
        // indices are shifted by one relative to the token indices
        for (int i = 0; i < tokens.size(); i++) {
            TOKEN_TYPE token = tokens.get(i);
            DEPNode node = clearNlpDepTree.get(i + 1);
            this.tokenOps.setPos(jCas, token, node.getPOSTag());
        }
    }
}
public static FrequencyDistribution<String> getDocumentSkipNgrams(JCas jcas,
        boolean lowerCaseNGrams, boolean filterPartialMatches, int minN, int maxN, int skipN,
        Set<String> stopwords)
{
    FrequencyDistribution<String> documentNgrams = new FrequencyDistribution<String>();
    for (Sentence s : select(jcas, Sentence.class)) {
        for (List<String> ngram : new SkipNgramStringListIterable(
                toText(selectCovered(Token.class, s)), minN, maxN, skipN)) {
            if (lowerCaseNGrams) {
                ngram = lower(ngram);
            }
            if (passesNgramFilter(ngram, stopwords, filterPartialMatches)) {
                String ngramString = StringUtils.join(ngram, NGRAM_GLUE);
                documentNgrams.inc(ngramString);
            }
        }
    }
    return documentNgrams;
}
forms.addAll(JCasUtil.toText(tokens));
public static FrequencyDistribution<String> getDocumentSkipNgrams(JCas jcas, Annotation anno,
        boolean lowerCaseNGrams, boolean filterPartialMatches, int minN, int maxN, int skipN,
        Set<String> stopwords)
{
    FrequencyDistribution<String> documentNgrams = new FrequencyDistribution<String>();
    for (Sentence s : selectCovered(jcas, Sentence.class, anno)) {
        for (List<String> ngram : new SkipNgramStringListIterable(
                toText(selectCovered(Token.class, s)), minN, maxN, skipN)) {
            if (lowerCaseNGrams) {
                ngram = lower(ngram);
            }
            if (passesNgramFilter(ngram, stopwords, filterPartialMatches)) {
                String ngramString = StringUtils.join(ngram, NGRAM_GLUE);
                documentNgrams.inc(ngramString);
            }
        }
    }
    return documentNgrams;
}
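// Hypothetical call site for the covered variant above. With minN = maxN = 2
// and skipN = 1, the token sequence "a b c" yields the plain bigrams "a b" and
// "b c" plus the 1-skip bigram "a c" (the exact enumeration is up to
// SkipNgramStringListIterable). `coveringAnnotation` is an assumed variable.
FrequencyDistribution<String> skipNgrams = getDocumentSkipNgrams(jcas, coveringAnnotation,
        true, false, 2, 2, 1, new HashSet<String>());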
@Override
public void process(JCas jCas) throws AnalysisEngineProcessException
{
    for (Annotation window : JCasUtil.select(jCas, this.windowClass)) {
        List<TOKEN_TYPE> tokens = this.tokenOps.selectTokens(jCas, window);
        List<String> tokenStrings = JCasUtil.toText(tokens);

        // All processing in ClearNLP goes through the DEPTree structure,
        // so populate it with token and POS tag info
        DEPTree depTree = new DEPTree(tokenStrings);
        for (int i = 1; i < depTree.size(); i++) {
            TOKEN_TYPE token = tokens.get(i - 1);
            DEPNode node = depTree.get(i);
            node.setPOSTag(this.tokenOps.getPos(jCas, token));
        }

        // Run the morphological analyzer
        this.mpAnalyzer.process(depTree);

        // Pull the lemmas out of the tree and write them back to the CAS tokens
        for (int i = 1; i < depTree.size(); i++) {
            TOKEN_TYPE token = tokens.get(i - 1);
            DEPNode node = depTree.get(i);
            this.tokenOps.setLemma(jCas, token, node.getLemma());
        }
    }
}
List<String> tokenStrings = JCasUtil.toText(tokens);
for (SpellingAnomaly anomaly : JCasUtil.selectCovered(jcas, SpellingAnomaly.class, sentence)) {