/**
 * Allocates a fresh aggregation buffer.
 *
 * The buffer is given a new estimator and an empty context list, then passed
 * through {@code reset} before being handed back.
 *
 * @return the newly created buffer
 * @throws HiveException if {@code reset} fails
 */
@Override
public AggregationBuffer getNewAggregationBuffer() throws HiveException {
  NGramAggBuf buffer = new NGramAggBuf();
  // Both members must exist before reset() is invoked on the buffer.
  buffer.nge = new NGramEstimator();
  buffer.context = new ArrayList<String>();
  reset(buffer);
  return buffer;
}
/**
 * Feeds every window of {@code agg.n} consecutive words of {@code seq} to the
 * buffer's estimator.
 *
 * Windows are visited from the last possible start index down to index 0, and
 * each one is copied into its own list before being added.
 *
 * @param agg buffer supplying the window width ({@code n}) and the estimator
 * @param seq the word sequence to slide the window over
 * @throws HiveException if the estimator rejects an n-gram
 */
private void processNgrams(NGramAggBuf agg, ArrayList<String> seq) throws HiveException {
  final int width = agg.n;
  int start = seq.size() - width;
  while (start >= 0) {
    ArrayList<String> window = new ArrayList<String>();
    final int end = start + width;
    for (int pos = start; pos < end; pos++) {
      window.add(seq.get(pos));
    }
    agg.nge.add(window);
    start--;
  }
}
/**
 * Produces the final result of the aggregation.
 *
 * @param agg the aggregation buffer; must be an {@code NGramAggBuf}
 * @return whatever the buffer's estimator returns from {@code getNGrams()}
 * @throws HiveException if the estimator fails to produce its n-grams
 */
@Override
public Object terminate(AggregationBuffer agg) throws HiveException {
  return ((NGramAggBuf) agg).nge.getNGrams();
}
if(!myagg.nge.isInitialized()) { int n = PrimitiveObjectInspectorUtils.getInt(parameters[1], nOI); int k = PrimitiveObjectInspectorUtils.getInt(parameters[2], kOI); myagg.nge.initialize(k, pf, n);
trim(false);
/**
 * Returns the buffer to its empty state: the estimator is reset and the
 * stored n-gram length is set back to 0.
 *
 * @param agg the aggregation buffer; must be an {@code NGramAggBuf}
 * @throws HiveException if resetting the estimator fails
 */
@Override
public void reset(AggregationBuffer agg) throws HiveException {
  NGramAggBuf buffer = (NGramAggBuf) agg;
  buffer.nge.reset();
  buffer.n = 0;
}
} // closes the enclosing scope opened before this excerpt
/**
 * Builds the partial result for this buffer.
 *
 * The estimator's serialized form is extended with one trailing element: the
 * buffer's {@code n} rendered as decimal text.
 *
 * @param agg the aggregation buffer; must be an {@code NGramAggBuf}
 * @return the serialized estimator followed by {@code n}
 * @throws HiveException if serialization fails
 */
@Override
public Object terminatePartial(AggregationBuffer agg) throws HiveException {
  NGramAggBuf buffer = (NGramAggBuf) agg;
  ArrayList<Text> serialized = buffer.nge.serialize();
  String nAsText = Integer.toString(buffer.n);
  serialized.add(new Text(nAsText));
  return serialized;
}
@Override public void merge(AggregationBuffer agg, Object partial) throws HiveException { if(partial == null) { return; } NGramAggBuf myagg = (NGramAggBuf) agg; List partialNGrams = (List) loi.getList(partial); int n = Integer.parseInt(partialNGrams.get(partialNGrams.size()-1).toString()); // A value of 0 for n indicates that the mapper processed data that does not meet // filter criteria, so merge() should be NO-OP. if (n == 0) { return; } if(myagg.n > 0 && myagg.n != n) { throw new HiveException(getClass().getSimpleName() + ": mismatch in value for 'n'" + ", which usually is caused by a non-constant expression. Found '"+n+"' and '" + myagg.n + "'."); } myagg.n = n; partialNGrams.remove(partialNGrams.size()-1); myagg.nge.merge(partialNGrams); }
if(!myagg.nge.isInitialized()) { int k = PrimitiveObjectInspectorUtils.getInt(parameters[2], kOI); int pf = 0; myagg.nge.initialize(k, pf, contextNulls);
trim(true); if(ngrams.size() < 1) { // SQL standard - return null for zero elements return null;
/**
 * Returns the buffer to its empty state: the estimator is reset and the
 * stored n-gram length is set back to 0.
 *
 * @param agg the aggregation buffer; must be an {@code NGramAggBuf}
 * @throws HiveException if resetting the estimator fails
 */
@Override
public void reset(AggregationBuffer agg) throws HiveException {
  NGramAggBuf buffer = (NGramAggBuf) agg;
  buffer.nge.reset();
  buffer.n = 0;
}
} // closes the enclosing scope opened before this excerpt
/**
 * Builds the partial result for this buffer.
 *
 * The estimator's serialized form is extended with one trailing element: the
 * buffer's {@code n} rendered as decimal text.
 *
 * @param agg the aggregation buffer; must be an {@code NGramAggBuf}
 * @return the serialized estimator followed by {@code n}
 * @throws HiveException if serialization fails
 */
@Override
public Object terminatePartial(AggregationBuffer agg) throws HiveException {
  NGramAggBuf buffer = (NGramAggBuf) agg;
  ArrayList<Text> serialized = buffer.nge.serialize();
  String nAsText = Integer.toString(buffer.n);
  serialized.add(new Text(nAsText));
  return serialized;
}
@Override public void merge(AggregationBuffer agg, Object obj) throws HiveException { if(obj == null) { return; } NGramAggBuf myagg = (NGramAggBuf) agg; List partial = (List) loi.getList(obj); // remove the context words from the end of the list int contextSize = Integer.parseInt( partial.get(partial.size()-1).toString() ); partial.remove(partial.size()-1); if(myagg.context.size() > 0) { if(contextSize != myagg.context.size()) { throw new HiveException(getClass().getSimpleName() + ": found a mismatch in the" + " context string lengths. This is usually caused by passing a non-constant" + " expression for the context."); } } else { for(int i = partial.size()-contextSize; i < partial.size(); i++) { String word = partial.get(i).toString(); if(word.equals("")) { myagg.context.add( null ); } else { myagg.context.add( word ); } } partial.subList(partial.size()-contextSize, partial.size()).clear(); myagg.nge.merge(partial); } }
if(!myagg.nge.isInitialized()) { int n = PrimitiveObjectInspectorUtils.getInt(parameters[1], nOI); int k = PrimitiveObjectInspectorUtils.getInt(parameters[2], kOI); myagg.nge.initialize(k, pf, n);
trim(false);
/**
 * Feeds every window of {@code agg.n} consecutive words of {@code seq} to the
 * buffer's estimator.
 *
 * Windows are visited from the last possible start index down to index 0, and
 * each one is copied into its own list before being added.
 *
 * @param agg buffer supplying the window width ({@code n}) and the estimator
 * @param seq the word sequence to slide the window over
 * @throws HiveException if the estimator rejects an n-gram
 */
private void processNgrams(NGramAggBuf agg, ArrayList<String> seq) throws HiveException {
  final int width = agg.n;
  int start = seq.size() - width;
  while (start >= 0) {
    ArrayList<String> window = new ArrayList<String>();
    final int end = start + width;
    for (int pos = start; pos < end; pos++) {
      window.add(seq.get(pos));
    }
    agg.nge.add(window);
    start--;
  }
}
/**
 * Produces the final result of the aggregation.
 *
 * @param agg the aggregation buffer; must be an {@code NGramAggBuf}
 * @return whatever the buffer's estimator returns from {@code getNGrams()}
 * @throws HiveException if the estimator fails to produce its n-grams
 */
@Override
public Object terminate(AggregationBuffer agg) throws HiveException {
  return ((NGramAggBuf) agg).nge.getNGrams();
}
/**
 * Returns the buffer to its empty state: the context list is cleared and the
 * estimator is reset.
 *
 * @param agg the aggregation buffer; must be an {@code NGramAggBuf}
 * @throws HiveException if resetting the estimator fails
 */
@Override
public void reset(AggregationBuffer agg) throws HiveException {
  NGramAggBuf buffer = (NGramAggBuf) agg;
  buffer.context.clear();
  buffer.nge.reset();
}
} // closes the enclosing scope opened before this excerpt
/**
 * Allocates a fresh aggregation buffer.
 *
 * The buffer is given a new estimator and an empty context list, then passed
 * through {@code reset} before being handed back.
 *
 * @return the newly created buffer
 * @throws HiveException if {@code reset} fails
 */
@Override
public AggregationBuffer getNewAggregationBuffer() throws HiveException {
  NGramAggBuf buffer = new NGramAggBuf();
  // Both members must exist before reset() is invoked on the buffer.
  buffer.nge = new NGramEstimator();
  buffer.context = new ArrayList<String>();
  reset(buffer);
  return buffer;
}
@Override public Object terminatePartial(AggregationBuffer agg) throws HiveException { NGramAggBuf myagg = (NGramAggBuf) agg; ArrayList<Text> result = myagg.nge.serialize(); // push the context on to the end of the serialized n-gram estimation for(int i = 0; i < myagg.context.size(); i++) { if(myagg.context.get(i) == null) { result.add(new Text("")); } else { result.add(new Text(myagg.context.get(i))); } } result.add(new Text(Integer.toString(myagg.context.size()))); return result; }