/**
 * Stores the match terms on the given iterator setting with substring matching turned off.
 * This is a convenience overload: it forwards every argument to
 * {@link #setRegexs(IteratorSetting, String, String, String, String, boolean, boolean)} and
 * supplies {@code false} for the trailing matchSubstring flag.
 *
 * @param si
 *          the scan iterator configuration that receives the terms
 * @param rowTerm
 *          pattern applied to the Key's row; ignored when null
 * @param cfTerm
 *          pattern applied to the Key's column family; ignored when null
 * @param cqTerm
 *          pattern applied to the Key's column qualifier; ignored when null
 * @param valueTerm
 *          pattern applied to the Key's value; ignored when null
 * @param orFields
 *          when true, an entry is returned if any one of the non-null terms matches
 */
public static void setRegexs(IteratorSetting si, String rowTerm, String cfTerm, String cqTerm, String valueTerm, boolean orFields) {
  // This overload never enables substring matching.
  final boolean matchSubstring = false;
  setRegexs(si, rowTerm, cfTerm, cqTerm, valueTerm, orFields, matchSubstring);
}
/**
 * Builds a {@link RegExFilter} iterator from a single search term and registers it on the scanner.
 * The same term is used as the row, column family, column qualifier and value pattern, and the
 * fields are OR-ed together. Substring matching is enabled when the command line carries the
 * match-substring option.
 *
 * @param prio
 *          iterator priority; must not be negative
 * @param name
 *          iterator name
 * @param term
 *          regular expression applied to every field
 * @param scanner
 *          scanner that receives the configured iterator
 * @param cl
 *          parsed command line, consulted for the match-substring option
 * @param negate
 *          forwarded to {@code RegExFilter.setNegate}
 * @throws IllegalArgumentException
 *           if {@code prio} is negative
 */
@Override
protected void setUpIterator(final int prio, final String name, final String term, final BatchScanner scanner, CommandLine cl, boolean negate)
    throws IOException {
  if (prio < 0) {
    throw new IllegalArgumentException("Priority < 0 " + prio);
  }

  final IteratorSetting regexSetting = new IteratorSetting(prio, name, RegExFilter.class);
  final boolean matchSubstring = cl.hasOption(matchSubstringOption.getOpt());

  // One term for all four fields; 'true' means a hit on any field is enough.
  RegExFilter.setRegexs(regexSetting, term, term, term, term, true, matchSubstring);
  RegExFilter.setNegate(regexSetting, negate);

  scanner.addScanIterator(regexSetting);
}
/**
 * Copies this filter by delegating to {@code super.deepCopy(env)} and then transferring the
 * regex state onto the copy: each of the four matchers is passed through {@code copyMatcher},
 * and the {@code orFields} flag is carried over unchanged.
 *
 * NOTE(review): only the four matchers and {@code orFields} are transferred here; if the filter
 * keeps further configuration, confirm it is propagated elsewhere.
 *
 * @param env
 *          iterator environment handed to the superclass copy
 * @return the copied filter
 */
@Override
public SortedKeyValueIterator<Key,Value> deepCopy(IteratorEnvironment env) {
  final RegExFilter copy = (RegExFilter) super.deepCopy(env);

  copy.orFields = this.orFields;
  copy.rowMatcher = copyMatcher(this.rowMatcher);
  copy.colfMatcher = copyMatcher(this.colfMatcher);
  copy.colqMatcher = copyMatcher(this.colqMatcher);
  copy.valueMatcher = copyMatcher(this.valueMatcher);

  return copy;
}
/**
 * Decides whether an entry passes the configured regular expressions.
 * With {@code orFields} set, the entry is accepted as soon as one field check succeeds;
 * otherwise every field check has to succeed. Fields are examined in the order row, column
 * family, column qualifier, value, and evaluation stops at the first decisive result.
 *
 * @param key
 *          key whose row, column family and column qualifier are tested
 * @param value
 *          value whose bytes are tested
 * @return true if the entry satisfies the configured combination of patterns
 */
@Override
public boolean accept(Key key, Value value) {
  // Key data is only fetched for fields that have a matcher; a null matcher gets a null input.
  if (orFields) {
    if (matches(rowMatcher, rowMatcher == null ? null : key.getRowData())) {
      return true;
    }
    if (matches(colfMatcher, colfMatcher == null ? null : key.getColumnFamilyData())) {
      return true;
    }
    if (matches(colqMatcher, colqMatcher == null ? null : key.getColumnQualifierData())) {
      return true;
    }
  } else {
    if (!matches(rowMatcher, rowMatcher == null ? null : key.getRowData())) {
      return false;
    }
    if (!matches(colfMatcher, colfMatcher == null ? null : key.getColumnFamilyData())) {
      return false;
    }
    if (!matches(colqMatcher, colqMatcher == null ? null : key.getColumnQualifierData())) {
      return false;
    }
  }

  // In both modes the value check is the last term and therefore decides the outcome.
  final byte[] valueBytes = value.get();
  return matches(valueMatcher, valueBytes, 0, valueBytes.length);
}
/**
 * Decides whether an entry passes the configured regular expressions.
 * With {@code orFields} set, the entry is accepted as soon as one field check succeeds;
 * otherwise every field check has to succeed. Fields are examined in the order row, column
 * family, column qualifier, value, and evaluation stops at the first decisive result.
 *
 * @param key
 *          key whose row, column family and column qualifier are tested
 * @param value
 *          value whose bytes are tested
 * @return true if the entry satisfies the configured combination of patterns
 */
@Override
public boolean accept(Key key, Value value) {
  // Key data is only fetched for fields that have a matcher; a null matcher gets a null input.
  if (orFields) {
    if (matches(rowMatcher, rowMatcher == null ? null : key.getRowData())) {
      return true;
    }
    if (matches(colfMatcher, colfMatcher == null ? null : key.getColumnFamilyData())) {
      return true;
    }
    if (matches(colqMatcher, colqMatcher == null ? null : key.getColumnQualifierData())) {
      return true;
    }
  } else {
    if (!matches(rowMatcher, rowMatcher == null ? null : key.getRowData())) {
      return false;
    }
    if (!matches(colfMatcher, colfMatcher == null ? null : key.getColumnFamilyData())) {
      return false;
    }
    if (!matches(colqMatcher, colqMatcher == null ? null : key.getColumnQualifierData())) {
      return false;
    }
  }

  // In both modes the value check is the last term and therefore decides the outcome.
  final byte[] valueBytes = value.get();
  return matches(valueMatcher, valueBytes, 0, valueBytes.length);
}
/**
 * Stores the match terms on the given iterator setting with substring matching turned off.
 * This is a convenience overload: it forwards every argument to
 * {@link #setRegexs(IteratorSetting, String, String, String, String, boolean, boolean)} and
 * supplies {@code false} for the trailing matchSubstring flag.
 *
 * @param si
 *          the scan iterator configuration that receives the terms
 * @param rowTerm
 *          pattern applied to the Key's row; ignored when null
 * @param cfTerm
 *          pattern applied to the Key's column family; ignored when null
 * @param cqTerm
 *          pattern applied to the Key's column qualifier; ignored when null
 * @param valueTerm
 *          pattern applied to the Key's value; ignored when null
 * @param orFields
 *          when true, an entry is returned if any one of the non-null terms matches
 */
public static void setRegexs(IteratorSetting si, String rowTerm, String cfTerm, String cqTerm, String valueTerm, boolean orFields) {
  // This overload never enables substring matching.
  final boolean matchSubstring = false;
  setRegexs(si, rowTerm, cfTerm, cqTerm, valueTerm, orFields, matchSubstring);
}
@Override public int run(String[] strings) throws Exception { conf.set(MRUtils.JOB_NAME_PROP, "Upgrade to Rya 3.2.2"); //faster init(); Job job = new Job(conf); job.setJarByClass(Upgrade322Tool.class); setupAccumuloInput(job); AccumuloInputFormat.setInputTableName(job, MRUtils.getTablePrefix(conf) + TBL_OSP_SUFFIX); //we do not need to change any row that is a string, custom, or uri type IteratorSetting regex = new IteratorSetting(30, "regex", RegExFilter.class); RegExFilter.setRegexs(regex, "\\w*" + TYPE_DELIM + "[\u0003|\u0008|\u0002]", null, null, null, false); RegExFilter.setNegate(regex, true); // set input output of the particular job job.setMapOutputKeyClass(Text.class); job.setMapOutputValueClass(Mutation.class); setupAccumuloOutput(job, MRUtils.getTablePrefix(conf) + TBL_SPO_SUFFIX); // set mapper and reducer classes job.setMapperClass(Upgrade322Mapper.class); job.setReducerClass(Reducer.class); // Submit the job return job.waitForCompletion(true) ? 0 : 1; }
/**
 * Copies this filter by delegating to {@code super.deepCopy(env)} and then transferring the
 * regex state onto the copy: each of the four matchers is passed through {@code copyMatcher},
 * and the {@code orFields} flag is carried over unchanged.
 *
 * NOTE(review): only the four matchers and {@code orFields} are transferred here; if the filter
 * keeps further configuration, confirm it is propagated elsewhere.
 *
 * @param env
 *          iterator environment handed to the superclass copy
 * @return the copied filter
 */
@Override
public SortedKeyValueIterator<Key,Value> deepCopy(IteratorEnvironment env) {
  final RegExFilter copy = (RegExFilter) super.deepCopy(env);

  copy.orFields = this.orFields;
  copy.rowMatcher = copyMatcher(this.rowMatcher);
  copy.colfMatcher = copyMatcher(this.colfMatcher);
  copy.colqMatcher = copyMatcher(this.colqMatcher);
  copy.valueMatcher = copyMatcher(this.valueMatcher);

  return copy;
}
/**
 * Builds a {@link RegExFilter} iterator from a single search term and registers it on the scanner.
 * The same term is used as the row, column family, column qualifier and value pattern, and the
 * fields are OR-ed together. Substring matching is enabled when the command line carries the
 * match-substring option.
 *
 * @param prio
 *          iterator priority; must not be negative
 * @param name
 *          iterator name
 * @param term
 *          regular expression applied to every field
 * @param scanner
 *          scanner that receives the configured iterator
 * @param cl
 *          parsed command line, consulted for the match-substring option
 * @throws IllegalArgumentException
 *           if {@code prio} is negative
 */
@Override
protected void setUpIterator(final int prio, final String name, final String term, final BatchScanner scanner, CommandLine cl) throws IOException {
  if (prio < 0) {
    throw new IllegalArgumentException("Priority < 0 " + prio);
  }

  final IteratorSetting regexSetting = new IteratorSetting(prio, name, RegExFilter.class);
  final boolean matchSubstring = cl.hasOption(matchSubstringOption.getOpt());

  // One term for all four fields; 'true' means a hit on any field is enough.
  RegExFilter.setRegexs(regexSetting, term, term, term, term, true, matchSubstring);

  scanner.addScanIterator(regexSetting);
}
@Override public int run(String[] strings) throws Exception { conf.set(MRUtils.JOB_NAME_PROP, "Upgrade to Rya 3.2.2"); //faster init(); Job job = new Job(conf); job.setJarByClass(Upgrade322Tool.class); setupAccumuloInput(job); AccumuloInputFormat.setInputTableName(job, MRUtils.getTablePrefix(conf) + TBL_OSP_SUFFIX); //we do not need to change any row that is a string, custom, or iri type IteratorSetting regex = new IteratorSetting(30, "regex", RegExFilter.class); RegExFilter.setRegexs(regex, "\\w*" + TYPE_DELIM + "[\u0003|\u0008|\u0002]", null, null, null, false); RegExFilter.setNegate(regex, true); // set input output of the particular job job.setMapOutputKeyClass(Text.class); job.setMapOutputValueClass(Mutation.class); setupAccumuloOutput(job, MRUtils.getTablePrefix(conf) + TBL_SPO_SUFFIX); // set mapper and reducer classes job.setMapperClass(Upgrade322Mapper.class); job.setReducerClass(Reducer.class); // Submit the job return job.waitForCompletion(true) ? 0 : 1; }
/**
 * Adds a {@link RegExFilter} to the given scanner so that it only returns
 * entries whose value corresponds to an edge with one of the given labels.
 * For every label the value pattern gets one alternative of the form
 * {@code .*<ID_DELIM><literal label>$}; alternatives are joined with {@code |}.
 *
 * NOTE(review): when no labels are passed the value pattern is the empty string;
 * confirm that callers always supply at least one label.
 *
 * @param scan the scanner that receives the filter iterator
 * @param labels the edge labels to accept; each is matched literally
 */
protected void applyEdgeLabelValueFilter(Scanner scan, String... labels) {
  StringBuilder regex = new StringBuilder();
  for (String lab : labels) {
    if (regex.length() != 0) {
      regex.append("|");
    }

    // Pattern.quote keeps the label literal even if it contains "\E", which would have
    // ended a hand-written \Q...\E section early and exposed the rest of the label as regex.
    // String.valueOf keeps the previous rendering of a null label as the text "null".
    regex.append(".*" + Constants.ID_DELIM)
        .append(java.util.regex.Pattern.quote(String.valueOf(lab)))
        .append("$");
  }

  IteratorSetting is = new IteratorSetting(10, "edgeValueFilter", RegExFilter.class);
  // Only the value term is set; row, column family and column qualifier are not filtered.
  RegExFilter.setRegexs(is, null, null, null, regex.toString(), false);
  scan.addScanIterator(is);
}
/**
 * Adds a {@link RegExFilter} to the given scanner so that it only returns
 * entries whose value corresponds to an edge with one of the given labels.
 * For every label the value pattern gets one alternative of the form
 * {@code .*<ID_DELIM><literal label>$}; alternatives are joined with {@code |}.
 *
 * NOTE(review): when no labels are passed the value pattern is the empty string;
 * confirm that callers always supply at least one label.
 *
 * @param scan the scanner that receives the filter iterator
 * @param labels the edge labels to accept; each is matched literally
 */
protected void applyEdgeLabelValueFilter(Scanner scan, String... labels) {
  StringBuilder regex = new StringBuilder();
  for (String lab : labels) {
    if (regex.length() != 0) {
      regex.append("|");
    }

    // Pattern.quote keeps the label literal even if it contains "\E", which would have
    // ended a hand-written \Q...\E section early and exposed the rest of the label as regex.
    // String.valueOf keeps the previous rendering of a null label as the text "null".
    regex.append(".*" + Constants.ID_DELIM)
        .append(java.util.regex.Pattern.quote(String.valueOf(lab)))
        .append("$");
  }

  IteratorSetting is = new IteratorSetting(10, "edgeValueFilter", RegExFilter.class);
  // Only the value term is set; row, column family and column qualifier are not filtered.
  RegExFilter.setRegexs(is, null, null, null, regex.toString(), false);
  scan.addScanIterator(is);
}
/**
 * Creates a {@link RegExFilter} setting to ignore the version row in a table.
 * The filter is negated, so rows matching the version pattern are excluded.
 * @return the {@link RegExFilter} {@link IteratorSetting}.
 */
public static IteratorSetting getVersionRegExFilterSetting() {
  final IteratorSetting regex = new IteratorSetting(30, "version_regex", RegExFilter.class);
  // The character class lists the two delimiter characters (U+0000, U+0001) directly.
  // A '|' inside [...] is a literal, not alternation, and would also match "#version|".
  RegExFilter.setRegexs(regex, "(.*)urn:(.*)#version[\u0000\u0001](.*)", null, null, null, false);
  Filter.setNegate(regex, true);
  return regex;
}
/**
 * Creates a {@link RegExFilter} setting to ignore the version row in a table.
 * The filter is negated, so rows matching the version pattern are excluded.
 * @return the {@link RegExFilter} {@link IteratorSetting}.
 */
public static IteratorSetting getVersionRegExFilterSetting() {
  final IteratorSetting regex = new IteratorSetting(30, "version_regex", RegExFilter.class);
  // The character class lists the two delimiter characters (U+0000, U+0001) directly.
  // A '|' inside [...] is a literal, not alternation, and would also match "#version|".
  RegExFilter.setRegexs(regex, "(.*)urn:(.*)#version[\u0000\u0001](.*)", null, null, null, false);
  Filter.setNegate(regex, true);
  return regex;
}
/**
 * Creates a {@link RegExFilter} setting to ignore the version row in a table.
 * The filter is negated, so rows matching the version pattern are excluded.
 * @return the {@link RegExFilter} {@link IteratorSetting}.
 */
public static IteratorSetting getVersionRegExFilterSetting() {
  final IteratorSetting regex = new IteratorSetting(30, "version_regex", RegExFilter.class);
  // The character class lists the two delimiter characters (U+0000, U+0001) directly.
  // A '|' inside [...] is a literal, not alternation, and would also match "#version|".
  RegExFilter.setRegexs(regex, "(.*)urn:(.*)#version[\u0000\u0001](.*)", null, null, null, false);
  Filter.setNegate(regex, true);
  return regex;
}
/**
 * Creates a {@link RegExFilter} setting to ignore the copy tool run time row in a table.
 * The filter is negated, so rows matching the run time pattern are excluded.
 * @return the {@link RegExFilter} {@link IteratorSetting}.
 */
public static IteratorSetting getCopyToolRunTimeRegExFilterSetting() {
  final IteratorSetting regex = new IteratorSetting(31, COPY_TOOL_RUN_TIME_LOCAL_NAME + "_regex", RegExFilter.class);
  // The character class lists the two delimiter characters (U+0000, U+0001) directly.
  // A '|' inside [...] is a literal, not alternation, and would also match a '|' after the name.
  RegExFilter.setRegexs(regex, "(.*)urn:(.*)#" + COPY_TOOL_RUN_TIME_LOCAL_NAME + "[\u0000\u0001](.*)", null, null, null, false);
  Filter.setNegate(regex, true);
  return regex;
}
/**
 * Creates a {@link RegExFilter} setting to ignore the copy tool time offset row in a table.
 * The filter is negated, so rows matching the time offset pattern are excluded.
 * @return the {@link RegExFilter} {@link IteratorSetting}.
 */
public static IteratorSetting getCopyToolTimeOffsetRegExFilterSetting() {
  final IteratorSetting regex = new IteratorSetting(33, COPY_TOOL_TIME_OFFSET_LOCAL_NAME + "_regex", RegExFilter.class);
  // The character class lists the two delimiter characters (U+0000, U+0001) directly.
  // A '|' inside [...] is a literal, not alternation, and would also match a '|' after the name.
  RegExFilter.setRegexs(regex, "(.*)urn:(.*)#" + COPY_TOOL_TIME_OFFSET_LOCAL_NAME + "[\u0000\u0001](.*)", null, null, null, false);
  Filter.setNegate(regex, true);
  return regex;
}
/**
 * Creates a {@link RegExFilter} setting to ignore the copy tool split time row in a table.
 * The filter is negated, so rows matching the split time pattern are excluded.
 * @return the {@link RegExFilter} {@link IteratorSetting}.
 */
public static IteratorSetting getCopyToolSplitTimeRegExFilterSetting() {
  final IteratorSetting regex = new IteratorSetting(32, COPY_TOOL_SPLIT_TIME_LOCAL_NAME + "_regex", RegExFilter.class);
  // The character class lists the two delimiter characters (U+0000, U+0001) directly.
  // A '|' inside [...] is a literal, not alternation, and would also match a '|' after the name.
  RegExFilter.setRegexs(regex, "(.*)urn:(.*)#" + COPY_TOOL_SPLIT_TIME_LOCAL_NAME + "[\u0000\u0001](.*)", null, null, null, false);
  Filter.setNegate(regex, true);
  return regex;
}
/**
 * Configures and runs a map-only MapReduce job that reads an Accumulo table through a
 * {@link RegExFilter} and writes the surviving entries as text.
 *
 * Expected arguments, in order, as consumed by the calls below:
 * {@code args[0]}, {@code args[1]} - passed to {@code setZooKeeperInstance};
 * {@code args[2]}, {@code args[3]}, {@code args[4]} - passed to {@code setInputInfo}
 * (the second one as UTF-8 bytes);
 * {@code args[5]}..{@code args[8]} - row, column family, column qualifier and value regex;
 * {@code args[9]} - output path.
 *
 * @param args
 *          the ten positional arguments described above
 * @return 0 if the job succeeded, 1 otherwise
 * @throws IllegalArgumentException
 *           if fewer than ten arguments are supplied
 * @throws Exception
 *           if job setup or execution fails
 */
public int run(String[] args) throws Exception {
  // Fail with a readable message instead of an ArrayIndexOutOfBoundsException further down.
  if (args == null || args.length < 10) {
    throw new IllegalArgumentException("Expected 10 arguments but got " + (args == null ? 0 : args.length)
        + ": <instance> <zookeepers> <user> <password> <table> <rowRegex> <columnFamilyRegex> <columnQualifierRegex> <valueRegex> <outputPath>");
  }

  Job job = new Job(getConf(), this.getClass().getSimpleName());
  job.setJarByClass(this.getClass());

  job.setInputFormatClass(AccumuloInputFormat.class);
  AccumuloInputFormat.setZooKeeperInstance(job.getConfiguration(), args[0], args[1]);
  // Encode with an explicit charset so the bytes do not depend on the platform default.
  AccumuloInputFormat.setInputInfo(job.getConfiguration(), args[2], args[3].getBytes(java.nio.charset.StandardCharsets.UTF_8), args[4],
      new Authorizations());

  IteratorSetting regex = new IteratorSetting(50, "regex", RegExFilter.class);
  // 'false' means the supplied terms are not OR-ed together.
  RegExFilter.setRegexs(regex, args[5], args[6], args[7], args[8], false);
  AccumuloInputFormat.addIterator(job.getConfiguration(), regex);

  job.setMapperClass(RegexMapper.class);
  job.setMapOutputKeyClass(Key.class);
  job.setMapOutputValueClass(Value.class);

  // Map-only job: mapper output goes straight to the output format.
  job.setNumReduceTasks(0);

  job.setOutputFormatClass(TextOutputFormat.class);
  TextOutputFormat.setOutputPath(job, new Path(args[9]));

  System.out.println("setRowRegex: " + args[5]);
  System.out.println("setColumnFamilyRegex: " + args[6]);
  System.out.println("setColumnQualifierRegex: " + args[7]);
  System.out.println("setValueRegex: " + args[8]);

  job.waitForCompletion(true);
  return job.isSuccessful() ? 0 : 1;
}