/**
 * Builds the features for the clique made of the current and the next position.
 *
 * <p>Features are emitted only when {@code flags.useNext}, {@code flags.useSequences}
 * and {@code flags.useNextSequences} are all enabled.
 *
 * @param cInfo the padded sequence being featurized
 * @param loc the position whose features are requested
 * @return a new, mutable collection of feature strings (possibly empty)
 */
protected Collection<String> featuresCnC(PaddedList<IN> cInfo, int loc) {
  CoreLabel current = cInfo.get(loc);
  Collection<String> result = new ArrayList<>();

  boolean nextSequencesEnabled = flags.useNext && flags.useSequences && flags.useNextSequences;
  if (!nextSequencesEnabled) {
    return result;
  }

  result.add("NSEQ");
  result.add(getWord(current) + "-NSEQW");
  return result;
}
protected Collection<String> featuresCpC(PaddedList<IN> cInfo, int loc) { Collection<String> features = new ArrayList<>(); CoreLabel c = cInfo.get(loc); CoreLabel p = cInfo.get(loc - 1); String charc = c.get(CoreAnnotations.CharAnnotation.class); String charp = p.get(CoreAnnotations.CharAnnotation.class); features.add(charc + charp + "-cngram"); // Indicator transition feature features.add("cliqueCpC"); return features; }
/**
 * Builds the features for the clique made of the previous, current and next positions.
 *
 * <p>Features are emitted only when {@code flags.useNext}, {@code flags.usePrev},
 * {@code flags.useSequences}, {@code flags.usePrevSequences} and
 * {@code flags.useNextSequences} are all enabled.
 *
 * @param cInfo the padded sequence being featurized
 * @param loc the position whose features are requested
 * @return a new, mutable collection of feature strings (possibly empty)
 */
protected Collection<String> featuresCpCnC(PaddedList<IN> cInfo, int loc) {
  CoreLabel current = cInfo.get(loc);
  Collection<String> result = new ArrayList<>();

  boolean bothDirections = flags.useNext && flags.usePrev;
  boolean sequencesEnabled =
      bothDirections
          && flags.useSequences
          && flags.usePrevSequences
          && flags.useNextSequences;
  if (!sequencesEnabled) {
    return result;
  }

  result.add("PNSEQ");
  result.add(getWord(current) + "-PNSEQW");
  return result;
}
protected Collection<String> featuresCp2C(PaddedList<IN> cInfo, int loc) { Collection<String> features = new ArrayList<>(); CoreLabel c = cInfo.get(loc); CoreLabel p = cInfo.get(loc - 1); CoreLabel p2 = cInfo.get(loc - 2); String charc = c.get(CoreAnnotations.CharAnnotation.class); String charp = p.get(CoreAnnotations.CharAnnotation.class); String charp2 = p2.get(CoreAnnotations.CharAnnotation.class); features.add(charc + charp + charp2 + "-cngram"); // Indicator transition feature features.add("cliqueCp2C"); return features; }
/**
 * Builds the features for the clique spanning the current position and the four before it.
 *
 * <p>Nothing is emitted unless {@code flags.maxLeft} is at least 4.
 *
 * @param cInfo the padded sequence being featurized
 * @param loc the position whose features are requested
 * @return a new, mutable collection of feature strings (possibly empty)
 */
protected Collection<String> featuresCpCp2Cp3Cp4C(PaddedList<IN> cInfo, int loc) {
  Collection<String> result = new ArrayList<>();
  CoreLabel previous = cInfo.get(loc - 1);

  if (flags.maxLeft < 4) {
    return result;
  }

  if (flags.useLongSequences) {
    result.add("PPPPSEQ");
  }

  // The boundary feature fires when the previous word is the document-reader boundary marker.
  boolean previousIsBoundary =
      flags.useBoundarySequences
          && getWord(previous).equals(CoNLLDocumentReaderAndWriter.BOUNDARY);
  if (previousIsBoundary) {
    result.add("BNDRY-SPAN-PPPPSEQ");
  }
  return result;
}
/**
 * Returns the pieces of the word at {@code position}, split on single spaces, as features.
 *
 * <p>The {@code clique} argument is not consulted.
 *
 * @param info the padded sequence being featurized
 * @param position the position whose word is split
 * @param clique ignored by this implementation
 * @return a new, mutable list of the space-separated pieces of the word
 */
@Override
public Collection<String> getCliqueFeatures(PaddedList<IN> info, int position, Clique clique) {
  String[] pieces = info.get(position).word().split(" ");
  // Copy into an ArrayList so the result is resizable (Arrays.asList alone is fixed-size).
  return new ArrayList<>(Arrays.asList(pieces));
}
protected Collection<String> featuresCp3C(PaddedList<IN> cInfo, int loc) { Collection<String> features = new ArrayList<>(); CoreLabel c = cInfo.get(loc); CoreLabel p = cInfo.get(loc - 1); CoreLabel p2 = cInfo.get(loc - 2); CoreLabel p3 = cInfo.get(loc - 3); String charc = c.get(CoreAnnotations.CharAnnotation.class); String charp = p.get(CoreAnnotations.CharAnnotation.class); String charp2 = p2.get(CoreAnnotations.CharAnnotation.class); String charp3 = p3.get(CoreAnnotations.CharAnnotation.class); features.add(charc + charp + charp2 + charp3 + "-cngram"); // Indicator transition feature features.add("cliqueCp3C"); return features; } }
protected Collection<String> featuresC(PaddedList<IN> cInfo, int loc) { Collection<String> features = super.featuresC(cInfo, loc); CoreLabel n3 = cInfo.get(loc + 3); CoreLabel p3 = cInfo.get(loc - 3); String charn3 = n3.get(CoreAnnotations.CharAnnotation.class); String charp3 = p3.get(CoreAnnotations.CharAnnotation.class); // a 7 character window instead of a 5 character window features.add(charn3 + "-n3"); features.add(charp3 + "-p3"); return features; } }
/**
 * Builds character-based features over the previous, current and next positions.
 *
 * <p>Features are emitted only when {@code flags.useWordn} is set. Two additional bigram
 * features are added when any of {@code flags.useAs}, {@code flags.useMsr},
 * {@code flags.usePk} or {@code flags.useHk} is set.
 *
 * @param cInfo the padded sequence being featurized
 * @param loc the position whose features are requested
 * @return a new, mutable collection of feature strings (possibly empty)
 */
public Collection<String> featuresCnC(PaddedList<IN> cInfo, int loc) {
  Collection<String> result = new ArrayList<>();

  CoreLabel current = cInfo.get(loc);
  CoreLabel next = cInfo.get(loc + 1);
  CoreLabel previous = cInfo.get(loc - 1);
  String currentChar = current.get(CoreAnnotations.CharAnnotation.class);
  String nextChar = next.get(CoreAnnotations.CharAnnotation.class);
  String previousChar = previous.get(CoreAnnotations.CharAnnotation.class);

  if (!flags.useWordn) {
    return result;
  }

  // Unigram features, then the previous+current bigram.
  result.add(currentChar + "c");
  result.add(nextChar + "c1");
  result.add(previousChar + "p");
  result.add(previousChar + currentChar + "pc");

  boolean extraBigrams = flags.useAs || flags.useMsr || flags.usePk || flags.useHk;
  if (extraBigrams) {
    result.add(currentChar + nextChar + "cc1");
    result.add(previousChar + nextChar + "pc1");
  }

  result.add("|wordn");
  return result;
} // end of CnC
/**
 * Emits a {@code "PAREN-MATCH"} feature when the current word and the word three positions
 * back form a bracket pair with no bracket of the partner kind in between.
 *
 * <p>Requires {@code flags.useParenMatching} and {@code flags.maxLeft >= 3}. Normally the
 * current word must be {@code ")"} or {@code "]"} and the partner {@code "("} or {@code "["};
 * when {@code flags.useReverse} is set the two roles are swapped.
 *
 * @param cInfo the padded sequence being featurized
 * @param loc the position whose features are requested
 * @return a new, mutable collection that is empty or holds the single match feature
 */
protected Collection<String> featuresCp3C(PaddedList<IN> cInfo, int loc) {
  CoreLabel here = cInfo.get(loc);
  CoreLabel back1 = cInfo.get(loc - 1);
  CoreLabel back2 = cInfo.get(loc - 2);
  CoreLabel back3 = cInfo.get(loc - 3);

  String word0 = getWord(here);
  String word1 = getWord(back1);
  String word2 = getWord(back2);
  String word3 = getWord(back3);

  Collection<String> result = new ArrayList<>();
  if (!flags.useParenMatching) {
    return result;
  }

  // "near" is the bracket expected at the current position, "far" the partner on the left.
  final boolean reversed = flags.useReverse;
  final String nearRound = reversed ? "(" : ")";
  final String nearSquare = reversed ? "[" : "]";
  final String farRound = reversed ? ")" : "(";
  final String farSquare = reversed ? "]" : "[";

  // The checks are nested so that they are evaluated in the same order, and with the same
  // short-circuiting, as a single chained condition.
  boolean currentIsNear = word0.equals(nearRound) || word0.equals(nearSquare);
  if (currentIsNear && flags.maxLeft >= 3) {
    boolean partnerAtEdge = word3.equals(farRound) || word3.equals(farSquare);
    if (partnerAtEdge) {
      boolean partnerInside =
          word2.equals(farRound)
              || word2.equals(farSquare)
              || word1.equals(farRound)
              || word1.equals(farSquare);
      if (!partnerInside) {
        result.add("PAREN-MATCH");
      }
    }
  }
  return result;
}
/**
 * Emits a {@code "PAREN-MATCH"} feature when the current word and the word four positions
 * back form a bracket pair with no bracket of the partner kind in between.
 *
 * <p>Requires {@code flags.useParenMatching} and {@code flags.maxLeft >= 4}. Normally the
 * current word must be {@code ")"} or {@code "]"} and the partner {@code "("} or {@code "["};
 * when {@code flags.useReverse} is set the two roles are swapped.
 *
 * @param cInfo the padded sequence being featurized
 * @param loc the position whose features are requested
 * @return a new, mutable collection that is empty or holds the single match feature
 */
protected Collection<String> featuresCp4C(PaddedList<IN> cInfo, int loc) {
  CoreLabel here = cInfo.get(loc);
  CoreLabel back1 = cInfo.get(loc - 1);
  CoreLabel back2 = cInfo.get(loc - 2);
  CoreLabel back3 = cInfo.get(loc - 3);
  CoreLabel back4 = cInfo.get(loc - 4);

  String word0 = getWord(here);
  String word1 = getWord(back1);
  String word2 = getWord(back2);
  String word3 = getWord(back3);
  String word4 = getWord(back4);

  Collection<String> result = new ArrayList<>();
  if (!flags.useParenMatching) {
    return result;
  }

  // "near" is the bracket expected at the current position, "far" the partner on the left.
  final boolean reversed = flags.useReverse;
  final String nearRound = reversed ? "(" : ")";
  final String nearSquare = reversed ? "[" : "]";
  final String farRound = reversed ? ")" : "(";
  final String farSquare = reversed ? "]" : "[";

  // The checks are nested so that they are evaluated in the same order, and with the same
  // short-circuiting, as a single chained condition.
  boolean currentIsNear = word0.equals(nearRound) || word0.equals(nearSquare);
  if (currentIsNear && flags.maxLeft >= 4) {
    boolean partnerAtEdge = word4.equals(farRound) || word4.equals(farSquare);
    if (partnerAtEdge) {
      boolean partnerInside =
          word3.equals(farRound)
              || word3.equals(farSquare)
              || word2.equals(farRound)
              || word2.equals(farSquare)
              || word1.equals(farRound)
              || word1.equals(farSquare);
      if (!partnerInside) {
        result.add("PAREN-MATCH");
      }
    }
  }
  return result;
}
Collection<String> features = new ArrayList<>(); if (flags.useWordn) { CoreLabel c = cInfo.get(loc); CoreLabel c2 = cInfo.get(loc + 1); CoreLabel p = cInfo.get(loc - 1); CoreLabel p2 = cInfo.get(loc - 2); String charc = c.getString(CoreAnnotations.CharAnnotation.class); String charc2 = c2.getString(CoreAnnotations.CharAnnotation.class);
protected Collection<String> featuresCpCp2Cp3C(PaddedList<? extends CoreLabel> cInfo, int loc) { Collection<String> features = new ArrayList<>(); if (flags.use4Clique && flags.maxLeft >= 3) { CoreLabel c = cInfo.get(loc); CoreLabel c2 = cInfo.get(loc + 1); CoreLabel p = cInfo.get(loc - 1); CoreLabel p2 = cInfo.get(loc - 2); CoreLabel p3 = cInfo.get(loc - 3); String charc = c.getString(CoreAnnotations.CharAnnotation.class); String charp = p.getString(CoreAnnotations.CharAnnotation.class);
@Override protected Collection<String> featuresCpC(PaddedList<IN> cInfo, int loc) { Collection<String> features = super.featuresCpC(cInfo, loc); CoreLabel c = cInfo.get(loc); // "Wrapper" feature: identity of first and last two chars of the current word. // This helps detect ma+_+sh in dialect, as well as avoiding segmenting possessive // pronouns if the word starts with al-. if (c.word().length() > 3) { String start = c.word().substring(0, 2); String end = c.word().substring(c.word().length() - 2); if (c.index() == 2) { features.add(start + "_" + end + "-begin-wrap"); } if (c.index() == c.word().length() - 1) { features.add(start + "_" + end + "-end-wrap"); } } return features; } }
@Override public CRFDatum<List<String>, CRFLabel> makeDatum(List<IN> info, int loc, List<FeatureFactory<IN>> featureFactories) { pad.set(CoreAnnotations.AnswerAnnotation.class, flags.backgroundSymbol); PaddedList<IN> pInfo = new PaddedList<>(info, pad); List<List<String>> features = new ArrayList<>(); Collection<Clique> done = Generics.newHashSet(); for (int i = 0; i < windowSize; i++) { List<String> featuresC = new ArrayList<>(); List<Clique> windowCliques = FeatureFactory.getCliques(i, 0); windowCliques.removeAll(done); done.addAll(windowCliques); for (Clique c : windowCliques) { for (FeatureFactory<IN> featureFactory : featureFactories) { featuresC.addAll(featureFactory.getCliqueFeatures(pInfo, loc, c)); } } if (testTime && i==0) { // this feature is only present at test time and only appears // in cliques of size 1 (i.e., cliques with window=0) featuresC.add(BIAS); } features.add(featuresC); } int[] labels = new int[windowSize]; for (int i = 0; i < windowSize; i++) { String answer = pInfo.get(loc + i - windowSize + 1).get(CoreAnnotations.AnswerAnnotation.class); labels[i] = classIndex.indexOf(answer); } return new CRFDatum<>(features, new CRFLabel(labels), null); }
/**
 * Creates a single Datum for the element of {@code info} at position {@code loc}.
 *
 * <p>Every clique of every feature factory contributes its features, each batch first being
 * passed through {@code addOtherClasses}. The collected features are handed to
 * {@code printFeatures} before the datum is built.
 *
 * @param info A List of IN objects
 * @param loc The position in the info list to focus feature creation on
 * @param featureFactories The factories that construct features out of the item
 * @return A Datum (BasicDatum) whose label is the answer annotation of the focused element
 */
public Datum<String, String> makeDatum(List<IN> info, int loc, List<FeatureFactory<IN>> featureFactories) {
  PaddedList<IN> padded = new PaddedList<>(info, pad);
  Collection<String> allFeatures = new ArrayList<>();

  for (FeatureFactory<IN> factory : featureFactories) {
    for (Clique clique : factory.getCliques()) {
      Collection<String> cliqueFeatures = factory.getCliqueFeatures(padded, loc, clique);
      cliqueFeatures = addOtherClasses(cliqueFeatures, padded, loc, clique);
      allFeatures.addAll(cliqueFeatures);
    }
  }

  printFeatures(padded.get(loc), allFeatures);

  // The label is read from the unpadded list.
  CoreLabel focus = info.get(loc);
  String answer = focus.get(CoreAnnotations.AnswerAnnotation.class);
  return new BasicDatum<>(allFeatures, answer);
}
/**
 * Extracts the features of one clique at a given index.
 *
 * <p>The clique is compared by identity against {@code cliqueC}, {@code cliqueCpC},
 * {@code cliqueCp2C} and {@code cliqueCp3C}; any other clique yields no base features.
 * If the element at {@code loc} carries a domain annotation, a domain-specific copy of
 * every feature (feature + {@code DOMAIN_MARKER} + domain) is added as well.
 *
 * @param cInfo The padded data sequence
 * @param loc The index at which to extract features.
 * @param clique The clique whose features are wanted
 * @return A new set of feature strings (possibly empty)
 */
public Collection<String> getCliqueFeatures(PaddedList<IN> cInfo, int loc, Clique clique) {
  Collection<String> result = Generics.newHashSet();

  // Pick the extractor and its suffix; the suffix doubles as the "clique recognised" marker.
  Collection<String> rawFeatures = null;
  String suffix = null;
  if (clique == cliqueC) {
    rawFeatures = featuresC(cInfo, loc);
    suffix = "C";
  } else if (clique == cliqueCpC) {
    rawFeatures = featuresCpC(cInfo, loc);
    suffix = "CpC";
  } else if (clique == cliqueCp2C) {
    rawFeatures = featuresCp2C(cInfo, loc);
    suffix = "Cp2C";
  } else if (clique == cliqueCp3C) {
    rawFeatures = featuresCp3C(cInfo, loc);
    suffix = "Cp3C";
  }
  if (suffix != null) {
    addAllInterningAndSuffixing(result, rawFeatures, suffix);
  }

  String domain = cInfo.get(loc).get(CoreAnnotations.DomainAnnotation.class);
  if (domain == null) {
    return result;
  }

  // Collect the variants separately so the set is not modified while it is iterated.
  String domainSuffix = DOMAIN_MARKER + domain;
  Collection<String> domainVariants = Generics.newHashSet();
  for (String feature : result) {
    domainVariants.add(feature + domainSuffix);
  }
  result.addAll(domainVariants);
  return result;
}