/**
 * Reads a single news article and writes its contents to a new Fiji row,
 * indexed by the article's name (a string consisting of the parent folder and
 * this article's hash) and the a priori categorization of this article.
 *
 * @param key The fully qualified path to the current file we're reading.
 * @param value The raw article data to insert.
 * @param context The context to write to.
 * @throws IOException if there is an error.
 */
@Override
public void produce(Text key, Text value, FijiTableContext context) throws IOException {
  Path qualifiedPath = new Path(key.toString());

  // Category is specified on the containing folder.
  String category = qualifiedPath.getParent().getName();
  // Name is the concatenation of category and file name.
  String name = category + "." + qualifiedPath.getName();

  // Write name, category, and raw article.
  EntityId entity = context.getEntityId(name);
  context.put(entity, FAMILY, ARTICLE_NAME_QUALIFIER, name);
  context.put(entity, FAMILY, CATEGORY_QUALIFIER, category);
  context.put(entity, FAMILY, RAW_ARTICLE_QUALIFIER, value.toString());
}
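The column constants referenced above are declared elsewhere in the class; a minimal sketch of what they might look like, with hypothetical family and qualifier names:

// Hypothetical declarations for the constants used by produce() above; the
// actual family and qualifier names come from the table layout and are
// assumptions here, not taken from the source.
private static final String FAMILY = "info";
private static final String ARTICLE_NAME_QUALIFIER = "name";
private static final String CATEGORY_QUALIFIER = "category";
private static final String RAW_ARTICLE_QUALIFIER = "raw_article";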
/**
 * Post-processes rejected lines (logging, keeping count, etc.).
 *
 * @param line the line that was rejected by the producer.
 * @param context the context in which the rejection occurred.
 * @param reason the reason why this line was rejected.
 */
public void reject(Text line, FijiTableContext context, String reason) {
  if (mRejectedLineCounter % mLogRate == 0L) {
    LOG.error("Rejecting line: {} with reason: {}", line.toString(), reason);
  }
  mRejectedLineCounter++;

  // TODO(FIJIMRLIB-9) Abort this bulk importer job early if rejected records exceed a threshold.
  context.incrementCounter(JobHistoryCounters.BULKIMPORTER_RECORDS_REJECTED);
  // TODO(FIJIMRLIB-4) Allow this to emit to a rejected output so that import can be reattempted.
}
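The counter state used by reject() is not shown; a minimal sketch under the assumption that the log rate is a fixed default (in practice it would likely come from job configuration):

// Hypothetical state backing reject(); the names match the usage above, the
// initial values are assumptions.
private long mRejectedLineCounter = 0L;
private long mLogRate = 1000L;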
/**
 * {@inheritDoc}
 * Cleans up job resources. User-overridden cleanup methods must call
 * super.cleanup().
 */
@Override
protected void cleanup(Context hadoopContext) throws IOException, InterruptedException {
  Preconditions.checkState(mTableContext != null);
  mTableContext.close();
  mTableContext = null;
  super.cleanup(hadoopContext);
}
/** {@inheritDoc} */
@Override
public void produce(final FijiRowData row, final FijiTableContext context) throws IOException {
  final Iterable<FijiCell<Object>> cells;
  if (mColumn.isFullyQualified()) {
    cells = row.asIterable(mColumn.getFamily(), mColumn.getQualifier());
  } else {
    cells = row.asIterable(mColumn.getFamily());
  }

  for (FijiCell<Object> cell : cells) {
    context.incrementCounter(Counters.CELLS_PROCESSED);

    final DecodedCell<Object> original =
        new DecodedCell<Object>(cell.getWriterSchema(), cell.getData());
    final DecodedCell<Object> rewritten = rewriteCell(original);

    // Only write back cells that rewriteCell() actually changed.
    if (rewritten != original) {
      context.put(
          row.getEntityId(),
          mColumn.getFamily(),
          mColumn.getQualifier(),
          cell.getTimestamp(),
          rewritten.getData());
      context.incrementCounter(Counters.CELLS_REWRITTEN);
    }
  }
}
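The pivoter above writes back only cells for which rewriteCell() returns a different instance. A minimal sketch of that hook, assuming an identity default that subclasses override:

// A sketch only: returning the argument unchanged signals "no rewrite", so
// produce() above skips the put. A real implementation would return a new
// DecodedCell, e.g. re-encoded with an updated writer schema.
protected DecodedCell<Object> rewriteCell(DecodedCell<Object> cell) {
  return cell;
}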
/** {@inheritDoc} */
@Override
public <T> void put(long timestamp, T value) throws IOException {
  Preconditions.checkNotNull(mEntityId);
  Preconditions.checkNotNull(mQualifier,
      "Producer output configured for a map-type family, use put(qualifier, timestamp, value)");
  mTableContext.put(mEntityId, mFamily, mQualifier, timestamp, value);
}
/**
 * Generates the entity id for this imported line using the source from the import descriptor.
 * Called within the produce() method.
 *
 * @param fields One line of input text split on the column delimiter.
 * @param context The context used by the produce() method.
 * @return The EntityId for the data that gets imported by this line.
 */
protected EntityId getEntityId(List<String> fields, FijiTableContext context) {
  // TODO(FIJIMRLIB-3) Extend this to support composite row key ids.
  String rowkey = fields.get(mFieldMap.get(getEntityIdSource()));
  return context.getEntityId(rowkey);
}
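mFieldMap maps each field name declared in the import descriptor to its position in the delimiter-split line. Its construction is not shown in the source; a hypothetical sketch (requires java.util.Map and java.util.HashMap):

// Assumed structure: field name -> index within the split line. The helper
// below is hypothetical; the real mapping comes from the import descriptor.
private final Map<String, Integer> mFieldMap = new HashMap<String, Integer>();

private void buildFieldMap(List<String> declaredFields) {
  for (int i = 0; i < declaredFields.size(); i++) {
    mFieldMap.put(declaredFields.get(i), i);
  }
}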
/** {@inheritDoc} */
@Override
public void flush() throws IOException {
  mTableContext.flush();
  super.flush();
}
/** {@inheritDoc} */
@Override
public <T> void put(String qualifier, long timestamp, T value) throws IOException {
  Preconditions.checkNotNull(mEntityId);
  Preconditions.checkState(null == mQualifier,
      "Qualifier already specified by producer configuration.");
  mTableContext.put(mEntityId, mFamily, qualifier, timestamp, value);
}
final EntityId user = context.getEntityId(firstName + "," + lastName);
context.put(user, Fields.INFO_FAMILY, Fields.FIRST_NAME, firstName);
context.put(user, Fields.INFO_FAMILY, Fields.LAST_NAME, lastName);
context.put(user, Fields.INFO_FAMILY, Fields.EMAIL, email);
context.put(user, Fields.INFO_FAMILY, Fields.TELEPHONE, telephone);
context.put(user, Fields.INFO_FAMILY, Fields.ADDRESS, streetAddr);
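The Fields constants used above are not part of the snippet; a hypothetical holder class consistent with that usage (the family and qualifier names are assumptions):

public static final class Fields {
  public static final String INFO_FAMILY = "info";
  public static final String FIRST_NAME = "first_name";
  public static final String LAST_NAME = "last_name";
  public static final String EMAIL = "email";
  public static final String TELEPHONE = "telephone";
  public static final String ADDRESS = "address";

  private Fields() { }
}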
/**
 * Post-processes incomplete lines (logging, keeping count, etc.).
 *
 * @param line the line that was marked incomplete by the producer.
 * @param context the context in which the incompletion occurred.
 * @param reason the reason why this line was incomplete.
 */
public void incomplete(Text line, FijiTableContext context, String reason) {
  if (mIncompleteLineCounter % mLogRate == 0L) {
    LOG.error("Incomplete line: {} with reason: {}", line.toString(), reason);
  }
  mIncompleteLineCounter++;

  // TODO(FIJIMRLIB-9) Abort this bulk importer job early if incomplete records exceed a threshold.
  context.incrementCounter(JobHistoryCounters.BULKIMPORTER_RECORDS_INCOMPLETE);
  // TODO(FIJIMRLIB-4) Add a strict mode where we reject incomplete lines.
}
/** {@inheritDoc} */
@Override
public void close() throws IOException {
  mTableContext.close();
  super.close();
}
// Fragment: the surrounding guard was elided in the original; the if-condition
// below (write with an explicit timestamp when one is available) is inferred
// from the two put() overloads being used.
final EntityId eid = context.getEntityId(entityIdSource);
String source = getSource(fijiColumnName);
String fieldValue = getFromPath(gson, source);
if (timestamp != null) {
  context.put(eid, family, qualifier, timestamp, convert(fijiColumnName, fieldValue));
} else {
  context.put(eid, family, qualifier, convert(fijiColumnName, fieldValue));
}
/** {@inheritDoc} */
@Override
protected void map(FijiRowData input, Context context) throws IOException {
  Preconditions.checkNotNull(mContext);
  mPivoter.produce(input, mContext);
  mContext.incrementCounter(JobHistoryCounters.PIVOTER_ROWS_PROCESSED);
}
/** {@inheritDoc} */
@Override
protected void cleanup(Context context) throws IOException {
  Preconditions.checkNotNull(mTableContext);
  mBulkImporter.cleanup(mTableContext);
  mTableContext.close();
  mTableContext = null;
  try {
    super.cleanup(context);
  } catch (InterruptedException ie) {
    throw new IOException(ie);
  }
}
/** {@inheritDoc} */
@Override
public void produce(Text value, FijiTableContext context) throws IOException {
  Map<Field, String> fieldMap;
  try {
    fieldMap = CommonLogParser.get().parseCommonLog(value.toString());
  } catch (ParseException pe) {
    reject(value, context, "Unable to parse row: " + value.toString());
    return;
  }

  Field entityIdSource = Field.valueOf(getEntityIdSource());
  EntityId eid = context.getEntityId(fieldMap.get(entityIdSource));

  for (FijiColumnName fijiColumnName : getDestinationColumns()) {
    Field source = Field.valueOf(getSource(fijiColumnName));
    String fieldValue = fieldMap.get(source);
    if (fieldValue != null) {
      // TODO(FIJIMRLIB-12) Add some ability to use timestamps derived from the log file.
      context.put(eid, fijiColumnName.getFamily(), fijiColumnName.getQualifier(), fieldValue);
    } else {
      reject(value, context, "Log file missing field: " + source);
    }
  }
}
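For context, the producer above consumes NCSA Common Log Format input. The canonical example line from the Apache documentation, with its fields (the exact names of the parser's Field enum values are not shown in the source):

// 127.0.0.1 - frank [10/Oct/2000:13:55:36 -0700] "GET /apache_pb.gif HTTP/1.0" 200 2326
// Fields in order: remote host, identd identity, authenticated user,
// timestamp, request line, status code, and response size in bytes.
Text line = new Text(
    "127.0.0.1 - frank [10/Oct/2000:13:55:36 -0700] \"GET /apache_pb.gif HTTP/1.0\" 200 2326");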
/** {@inheritDoc} */
@Override
protected void cleanup(Context context) throws IOException {
  Preconditions.checkNotNull(mContext);
  mPivoter.cleanup(mContext);
  mContext.close();
  mContext = null;
  super.cleanup(context);
}
final EntityId eid = context.getEntityId(entityIdStr);
String family = fijiColumnName.getFamily();
String qualifier = fijiColumnName.getQualifier();
// Guard inferred from the else-branch; the original fragment elided it.
if (fieldValue != null) {
  context.put(eid, family, qualifier, timestamp, convert(fijiColumnName, fieldValue));
} else {
  incomplete(xmlText, context, "Detected missing field: " + source);
}
/** {@inheritDoc} */
@Override
public void produce(ImmutableBytesWritable hbaseRowKey, Result hbaseRow, FijiTableContext context)
    throws IOException {
  EntityId entity = context.getEntityId(Bytes.toString(hbaseRowKey.get()));

  for (ColumnDescriptor columnDescriptor : mColumnDescriptors) {
    KeyValue keyValue = hbaseRow.getColumnLatest(
        columnDescriptor.getHBaseFamilyBytes(),
        columnDescriptor.getHBaseQualifierBytes());
    if (null == keyValue) {
      // No data in this HTable column, skip it.
      continue;
    }

    // Convert the HBase cell to a Fiji cell.
    DecodedCell<?> fijiCell = decodeHBaseCell(columnDescriptor, keyValue.getValue());

    // Write it at the same timestamp as the HBase cell.
    final String family = columnDescriptor.getFijiColumnName().getFamily();
    final String qualifier = columnDescriptor.getFijiColumnName().getQualifier();
    context.put(entity, family, qualifier, keyValue.getTimestamp(), fijiCell.getData());
  }
}
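A hypothetical shape for ColumnDescriptor, inferred only from the accessors the producer above calls; the field names and constructor are assumptions:

public static final class ColumnDescriptor {
  private final byte[] mHBaseFamily;
  private final byte[] mHBaseQualifier;
  private final FijiColumnName mFijiColumnName;

  public ColumnDescriptor(String hbaseFamily, String hbaseQualifier, FijiColumnName target) {
    mHBaseFamily = Bytes.toBytes(hbaseFamily);
    mHBaseQualifier = Bytes.toBytes(hbaseQualifier);
    mFijiColumnName = target;
  }

  public byte[] getHBaseFamilyBytes() { return mHBaseFamily; }
  public byte[] getHBaseQualifierBytes() { return mHBaseQualifier; }
  public FijiColumnName getFijiColumnName() { return mFijiColumnName; }
}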