/**
 * Reports whether the field at {@code fieldNum} is null by delegating
 * to the wrapped tuple {@code t}.
 *
 * @param fieldNum index of the field to inspect
 * @return true if the wrapped tuple holds null at that position
 * @throws ExecException if the wrapped tuple cannot answer for that index
 */
@Override
public boolean isNull(int fieldNum) throws ExecException {
    final boolean fieldIsNull = t.isNull(fieldNum);
    return fieldIsNull;
}
/**
 * Decides whether a grouping key should be treated as null.
 *
 * <p>A key is "null" when the reference itself is null, or — for composite
 * (tuple) keys — when any one of its component fields is null. Non-tuple,
 * non-null keys are never considered null here.
 *
 * @param key the key object, possibly a {@link Tuple}
 * @return true if the key (or any tuple component of it) is null
 * @throws ExecException if a tuple field's null status cannot be read
 */
protected boolean isKeyNull(Object key) throws ExecException {
    if (key == null) {
        return true;
    }
    if (!(key instanceof Tuple)) {
        return false;
    }
    // Composite key: null if any component is null.
    Tuple keyTuple = (Tuple) key;
    final int size = keyTuple.size();
    for (int idx = 0; idx < size; idx++) {
        if (keyTuple.isNull(idx)) {
            return true;
        }
    }
    return false;
}
/**
 * Compares our null status against the null status of position {@code pos}
 * in the given tuple, delegating to {@code compareNull(boolean, boolean)}.
 *
 * @param usNull whether our side is null
 * @param t      the tuple holding the other side
 * @param pos    field position to test in {@code t}
 * @return the ordering result from the two-boolean overload
 */
protected int compareNull(boolean usNull, Tuple t, int pos) {
    final boolean otherIsNull;
    try {
        otherIsNull = t.isNull(pos);
    } catch (ExecException e) {
        // Surface as unchecked, keeping the original cause attached.
        throw new RuntimeException("Unable to check if position " + pos + " is null in Tuple: " + t, e);
    }
    return compareNull(usNull, otherIsNull);
}
/**
 * Reports whether field {@code idx} is null, first forcing the field to be
 * loaded via {@code get(idx)} before consulting {@code realTuple}.
 *
 * NOTE(review): the return value of get(idx) is discarded, so it is
 * presumably called only for its side effect — materializing the field
 * into realTuple before the null check. Confirm against get()'s
 * implementation.
 */
@Override public boolean isNull(int idx) throws ExecException { get(idx); return realTuple.isNull(idx); }
/**
 * Reports whether appended field {@code i} is null. Every appended field
 * counts as null when the whole appended section is null.
 *
 * @param i index within the appended fields
 * @return true if the appended section, or the specific field, is null
 * @throws ExecException if the appended tuple cannot answer for that index
 */
private boolean isAppendedFieldNull(int i) throws ExecException {
    if (isAppendedFieldsNull()) {
        return true;
    }
    return appendedFields.isNull(i);
}
// NOTE(review): this hasNext() ADVANCES the underlying iterator and mutates
// currentId/currentText as a side effect, violating the usual Iterator
// contract — calling hasNext() twice without consuming the element will skip
// a tuple. Confirm callers only ever call it once per element.
//
// NOTE(review): behavior quirks to confirm as intended:
//  - a tuple whose id or text is EMPTY terminates iteration (returns false)
//    rather than being skipped;
//  - a tuple with a NULL field 0 or 1 also ends iteration (falls through to
//    the final return false) instead of advancing to the next tuple.
public boolean hasNext() {
    if (tupleItr.hasNext()) {
        Tuple t = tupleItr.next();
        try {
            // Only tuples with both fields non-null are usable.
            if (!t.isNull(0) && !t.isNull(1)) {
                // Cache the id/text pair for the accompanying accessor(s).
                currentId = t.get(0).toString();
                currentText = t.get(1).toString();
                if (currentId.isEmpty() || currentText.isEmpty()) {
                    return false;
                } else {
                    return true;
                }
            }
        } catch (ExecException e) {
            throw new RuntimeException(e);
        }
    }
    return false;
}
/**
 * Returns the scalar inner product of this and the other term vector by
 * multiplying each entry for the same term.
 *
 * <p>There are undoubtedly ways to optimize this. Please, enlighten me.
 *
 * @param other another term vector
 * @return the dot product
 * @throws ExecException if a tuple field cannot be read
 */
public Double dotProduct(TermVector other) throws ExecException {
    // Primitive accumulator: the original Double accumulator autoboxed on
    // every += in the inner loop.
    double result = 0.0;
    for (Tuple x_i : this) {
        // The null checks on x_i are loop-invariant — hoist them out of the
        // inner loop instead of re-evaluating per y_i.
        if (x_i.isNull(0) || x_i.isNull(1)) {
            continue;
        }
        String term = x_i.get(0).toString();
        double weight = (Double) x_i.get(1);
        for (Tuple y_i : other) {
            if (y_i.isNull(0) || y_i.isNull(1)) {
                continue;
            }
            // Accumulate only when the terms match.
            if (term.equals(y_i.get(0).toString())) {
                result += weight * (Double) y_i.get(1);
            }
        }
    }
    return result;
}
/** Map a tuple object into a map-writable object for elasticsearch. */ @SuppressWarnings("unchecked") @Override public void putNext(Tuple t) throws IOException { if (!t.isNull(0)) { MapWritable record = new MapWritable(); String jsonData = t.get(0).toString(); // parse json data and put into mapwritable record try { HashMap<String,Object> data = mapper.readValue(jsonData, HashMap.class); record = (MapWritable)toWritable(data); } catch (JsonParseException e) { e.printStackTrace(); } catch (JsonMappingException e) { e.printStackTrace(); } try { writer.write(NullWritable.get(), record); } catch (InterruptedException e) { throw new IOException(e); } } }
// NOTE(review): fragment — the enclosing method and the close of these braces
// are not visible here. Guard: only proceed when field 0 is non-null; field 0
// presumably holds a raw JSON payload (per the variable name) — confirm
// against the enclosing method.
if (!t.isNull(0)) { String jsonData = t.get(0).toString();
public DataBag exec(Tuple input) throws IOException { if (input == null || input.size() < 1 || input.isNull(0)) return null; // Output bag DataBag bagOfTokens = bagFactory.newDefaultBag(); StringReader textInput = new StringReader(input.get(0).toString()); PTBTokenizer ptbt = new PTBTokenizer(textInput, new CoreLabelTokenFactory(), ""); for (CoreLabel label; ptbt.hasNext(); ) { label = (CoreLabel)ptbt.next(); Tuple termText = tupleFactory.newTuple(label.toString()); bagOfTokens.add(termText); } return bagOfTokens; } }
/**
 * Uses Lucene's StandardAnalyzer and runs the tokens through several lucene filters
 *  - LengthFilter: Filter individual words to be of length &gt; minWordSize
 *  - ShingleFilter: Converts word stream into n-gram stream
 *  - PatternReplaceFilter: Removes the 'filler' character that ShingleFilter
 *    puts in to replace stopwords
 *
 * @param input tuple whose first field is the text to tokenize
 * @return a bag of token/n-gram tuples, or null for missing/null input
 * @throws IOException if tokenization fails
 */
public DataBag exec(Tuple input) throws IOException {
    // Pig UDF convention: null/missing input yields a null result.
    if (input == null || input.size() < 1 || input.isNull(0)) return null;
    // NOTE(review): the TokenStream is never reset/closed in this method —
    // presumably fillBag() owns that lifecycle; confirm, since Lucene requires
    // reset() before incrementToken() and close() afterwards.
    TokenStream stream = analyzer.tokenStream(NOFIELD, input.get(0).toString());
    LengthFilter filtered = new LengthFilter(Version.LUCENE_44, stream, minWordSize, Integer.MAX_VALUE); // Let words be long
    DataBag result;
    if (minGramSize == 1 && maxGramSize == 1) {
        // Unigram-only request: no shingling needed.
        result = fillBag(filtered);
    } else {
        // Build n-grams, then strip the filler tokens ShingleFilter inserts
        // where stopwords were removed.
        ShingleFilter nGramStream = new ShingleFilter(filtered, minGramSize, maxGramSize);
        nGramStream.setOutputUnigrams(outputUnigrams);
        PatternReplaceFilter replacer = new PatternReplaceFilter(nGramStream, SHINGLE_FILLER, NOFIELD, true);
        result = fillBag(replacer);
    }
    return result;
}
// NOTE(review): fragment — the enclosing method and the rest of this loop are
// not visible here.
// Invariant: the underlying field list and the Pig schema field list are
// expected to line up 1:1 (assert is a no-op unless -ea is enabled).
assert fields.size() == pigFields.size(); for (int i = 0; i < fields.size(); i++) { // Null fields are skipped rather than converted.
if (t.isNull(i)) { continue;