/**
 * Stores the given factory and creates one annotation for each of the
 * {@code _nullAnnotation}, {@code _trueAnnotation} and
 * {@code _falseAnnotation} fields.
 *
 * @param annotationFactory
 *            the factory that is kept in {@code _annotationFactory} and used
 *            to create the three annotations
 */
public BooleanAnalyzerColumnDelegate(final RowAnnotationFactory annotationFactory) {
    _annotationFactory = annotationFactory;

    // annotations are created in null / true / false order
    _nullAnnotation = annotationFactory.createAnnotation();
    _trueAnnotation = annotationFactory.createAnnotation();
    _falseAnnotation = annotationFactory.createAnnotation();
}
public StringAnalyzerColumnDelegate(final RowAnnotationFactory annotationFactory) { _annotationFactory = annotationFactory; _nullAnnotation = annotationFactory.createAnnotation(); _blankAnnotation = annotationFactory.createAnnotation(); _entirelyUppercaseAnnotation = annotationFactory.createAnnotation(); _entirelyLowercaseAnnotation = annotationFactory.createAnnotation(); _maxCharsAnnotation = annotationFactory.createAnnotation(); _minCharsAnnotation = annotationFactory.createAnnotation(); _maxWhitespaceAnnotation = annotationFactory.createAnnotation(); _minWhitespaceAnnotation = annotationFactory.createAnnotation(); _uppercaseExclFirstLetterAnnotation = annotationFactory.createAnnotation(); _digitAnnotation = annotationFactory.createAnnotation(); _diacriticAnnotation = annotationFactory.createAnnotation(); _maxWordsAnnotation = annotationFactory.createAnnotation(); _minWordsAnnotation = annotationFactory.createAnnotation(); }
/**
 * Stores the given factory, creates the null/max/min date and time
 * annotations, resets the row counter and selects the statistics
 * implementation.
 *
 * @param descriptiveStatistics
 *            when {@code true} a {@code DescriptiveStatistics} instance is
 *            used, otherwise a {@code SummaryStatistics} instance
 * @param annotationFactory
 *            the factory that is kept in {@code _annotationFactory} and used
 *            to create the annotations
 */
public DateAndTimeAnalyzerColumnDelegate(final boolean descriptiveStatistics,
        final RowAnnotationFactory annotationFactory) {
    _annotationFactory = annotationFactory;

    _nullAnnotation = annotationFactory.createAnnotation();
    _maxDateAnnotation = annotationFactory.createAnnotation();
    _minDateAnnotation = annotationFactory.createAnnotation();
    _maxTimeAnnotation = annotationFactory.createAnnotation();
    _minTimeAnnotation = annotationFactory.createAnnotation();

    _numRows = 0;
    _statistics = descriptiveStatistics ? new DescriptiveStatistics() : new SummaryStatistics();
}
/**
 * Stores the factory and the unicode sets, and creates one annotation per
 * key of {@code unicodeSets}.
 *
 * @param annotationFactory
 *            the factory that is kept in {@code _annotationFactory} and used
 *            to create the annotations
 * @param unicodeSets
 *            the named unicode sets; every key gets its own entry in
 *            {@code _annotations}
 */
public CharacterSetDistributionAnalyzerColumnDelegate(final RowAnnotationFactory annotationFactory,
        final Map<String, UnicodeSet> unicodeSets) {
    _unicodeSets = unicodeSets;
    _annotationFactory = annotationFactory;

    _annotations = new HashMap<>();
    for (final String unicodeSetName : unicodeSets.keySet()) {
        _annotations.put(unicodeSetName, annotationFactory.createAnnotation());
    }
}
/**
 * Stores the given factory, creates the null/max/min annotations and selects
 * the statistics implementation.
 *
 * @param descriptiveStatistics
 *            when {@code true} a {@code DescriptiveStatistics} instance is
 *            used, otherwise a {@code SummaryStatistics} instance
 * @param annotationFactory
 *            the factory that is kept in {@code _annotationFactory} and used
 *            to create the annotations
 */
public NumberAnalyzerColumnDelegate(final boolean descriptiveStatistics,
        final RowAnnotationFactory annotationFactory) {
    _annotationFactory = annotationFactory;

    _nullAnnotation = annotationFactory.createAnnotation();
    _maxAnnotation = annotationFactory.createAnnotation();
    _minAnnotation = annotationFactory.createAnnotation();

    _statistics = descriptiveStatistics ? new DescriptiveStatistics() : new SummaryStatistics();
}
/**
 * Returns the annotation stored for the given (x, y) pair, creating and
 * storing a new one when none is present yet.
 *
 * @param x
 *            the first component of the lookup key
 * @param y
 *            the second component of the lookup key
 * @return the annotation mapped to the pair in {@code _annotations}
 */
private RowAnnotation getAnnotation(final Number x, final Number y) {
    // build the key once instead of allocating an identical pair for the
    // containment check, the lookup and the insertion
    final ImmutablePair<Number, Number> key = new ImmutablePair<>(x, y);

    RowAnnotation annotation = _annotations.get(key);
    if (annotation == null) {
        annotation = _rowAnnotationFactory.createAnnotation();
        _annotations.put(key, annotation);
    }
    return annotation;
}
/**
 * Constructor for direct usage within e.g. other components where we always
 * expect to do LEFT JOIN (max one record) semantic lookups. The join semantic
 * is therefore not a parameter; it is always set to
 * {@link JoinSemantic#LEFT_JOIN_MAX_ONE}.
 *
 * @param datastore
 *            assigned to the {@code datastore} field
 * @param schemaName
 *            assigned to the {@code schemaName} field
 * @param tableName
 *            assigned to the {@code tableName} field
 * @param conditionColumns
 *            assigned to the {@code conditionColumns} field
 * @param conditionValues
 *            assigned to the {@code conditionValues} field
 * @param outputColumns
 *            assigned to the {@code outputColumns} field
 * @param cacheLookups
 *            assigned to the {@code cacheLookups} field
 */
public TableLookupTransformer(final Datastore datastore, final String schemaName, final String tableName,
        final String[] conditionColumns, final InputColumn<?>[] conditionValues, final String[] outputColumns,
        final boolean cacheLookups) {
    // where to look
    this.datastore = datastore;
    this.schemaName = schemaName;
    this.tableName = tableName;

    // what to match on and what to return
    this.conditionColumns = conditionColumns;
    this.conditionValues = conditionValues;
    this.outputColumns = outputColumns;

    // lookup behaviour
    this.cacheLookups = cacheLookups;
    this.joinSemantic = JoinSemantic.LEFT_JOIN_MAX_ONE;

    // no factory is passed in here, so a dummy factory supplies the
    // matches / cached / misses annotations
    _annotationFactory = new DummyRowAnnotationFactory();
    _matches = _annotationFactory.createAnnotation();
    _cached = _annotationFactory.createAnnotation();
    _misses = _annotationFactory.createAnnotation();
}
public RowAnnotation getAnnotation() { if (_annotation == null) { // only occurs for deserialized instances return RowAnnotations.getDefaultFactory().createAnnotation(); } return _annotation; }
/**
 * Creates an annotation for a new pattern, stores it in {@code _annotations}
 * under that pattern and annotates the given row with it.
 *
 * @param pattern
 *            the pattern used as key for the new annotation
 * @param row
 *            the row to annotate
 * @param value
 *            not used by this implementation
 * @param distinctCount
 *            the count passed on to the factory when annotating the row
 */
@Override
protected void storeNewPattern(final TokenPattern pattern, final InputRow row, final String value,
        final int distinctCount) {
    final RowAnnotation patternAnnotation = _annotationFactory.createAnnotation();
    _annotations.put(pattern, patternAnnotation);
    _annotationFactory.annotate(row, distinctCount, patternAnnotation);
}
/**
 * Annotates the row with the annotation that belongs to the given list of
 * inspection outcomes. If no annotation is stored for that list yet, one is
 * created and stored while holding the lock on {@code this}.
 *
 * @param row
 *            the row to annotate
 * @param inspectionOutcomes
 *            the key used to look up the annotation in {@code _observations}
 */
public void addObservation(InputRow row, List<Object> inspectionOutcomes) {
    // first lookup happens without taking the lock
    RowAnnotation outcomeAnnotation = _observations.get(inspectionOutcomes);

    if (outcomeAnnotation == null) {
        synchronized (this) {
            // look again under the lock before creating a new annotation
            final RowAnnotation registered = _observations.get(inspectionOutcomes);
            if (registered != null) {
                outcomeAnnotation = registered;
            } else {
                outcomeAnnotation = _rowAnnotationFactory.createAnnotation();
                _observations.put(inspectionOutcomes, outcomeAnnotation);
            }
        }
    }

    _rowAnnotationFactory.annotate(row, outcomeAnnotation);
}
/**
 * Stores the constructor arguments, initialises the total counter and the
 * annotation map, and sets up the annotation used for null values.
 *
 * @param groupName
 *            assigned to {@code _groupName}
 * @param annotationFactory
 *            assigned to {@code _annotationFactory}; it creates the null
 *            value annotation when {@code recordAnnotations} is {@code true}
 * @param recordAnnotations
 *            when {@code false} the null value annotation is a plain
 *            {@code RowAnnotationImpl} instead of a factory-created one
 * @param inputColumns
 *            assigned to {@code _inputColumns}
 */
public ValueDistributionGroup(final String groupName, final RowAnnotationFactory annotationFactory,
        final boolean recordAnnotations, final InputColumn<?>[] inputColumns) {
    _groupName = groupName;
    _inputColumns = inputColumns;
    _annotationFactory = annotationFactory;
    _recordAnnotations = recordAnnotations;

    _totalCount = new AtomicInteger();
    _annotationMap = new HashMap<>();

    _nullValueAnnotation = recordAnnotations ? annotationFactory.createAnnotation() : new RowAnnotationImpl();
}
/**
 * Looks up the annotation stored for the given inspection outcomes, creating
 * and storing it inside a {@code synchronized (this)} block when it is
 * missing, and then annotates the row with it.
 *
 * @param row
 *            the row to annotate
 * @param inspectionOutcomes
 *            the key used to look up the annotation in {@code _observations}
 */
public void addObservation(InputRow row, List<Object> inspectionOutcomes) {
    RowAnnotation target = _observations.get(inspectionOutcomes);

    if (target == null) {
        synchronized (this) {
            // the entry may have been added since the unlocked lookup above
            final RowAnnotation current = _observations.get(inspectionOutcomes);
            if (current == null) {
                target = _rowAnnotationFactory.createAnnotation();
                _observations.put(inspectionOutcomes, target);
            } else {
                target = current;
            }
        }
    }

    _rowAnnotationFactory.annotate(row, target);
}
/**
 * Resets the total counter and the value annotation map, then stores one
 * newly created annotation per expected value, keyed by the result of
 * {@code getLookupValue} for that value.
 */
@Initialize
public void init() {
    _valueAnnotations = new ConcurrentHashMap<>();
    _totalCount = new AtomicInteger();

    for (final String expectedValue : expectedValues) {
        // the annotation is created before the lookup value is resolved
        final RowAnnotation valueAnnotation = _rowAnnotationFactory.createAnnotation();
        _valueAnnotations.put(getLookupValue(expectedValue), valueAnnotation);
    }
}
@Override public void run(final InputRow row, final int distinctCount) { final Boolean[] values = new Boolean[_columns.length]; for (int i = 0; i < values.length; i++) { final InputColumn<Boolean> col = _columns[i]; final Boolean value = row.getValue(col); final BooleanAnalyzerColumnDelegate delegate = _columnDelegates.get(col); values[i] = value; delegate.run(value, row, distinctCount); } // collect all combinations of booleans if (_columns.length > 1) { final ValueCombination<Boolean> valueCombination = new ValueCombination<>(values); RowAnnotation annotation = _valueCombinations.get(valueCombination); if (annotation == null) { annotation = _annotationFactory.createAnnotation(); _valueCombinations.put(valueCombination, annotation); } _annotationFactory.annotate(row, distinctCount, annotation); } }
@Override public CategorizationResult reduce(final Collection<? extends CategorizationResult> results) { final RowAnnotationFactory annotationFactory = null; final Map<String, RowAnnotation> reducedCategories = new LinkedHashMap<>(); for (final CategorizationResult result : results) { final Collection<String> categoryNames = result.getCategoryNames(); for (final String categoryName : categoryNames) { final RowAnnotation partialAnnotation = result.getCategoryRowAnnotation(categoryName); final RowAnnotation reducedAnnotation = reducedCategories.get(categoryName); if (reducedAnnotation == null) { // adopt the annotation from the partial result final RowAnnotation annotation = _rowAnnotationFactory.createAnnotation(); _rowAnnotationFactory.transferAnnotations(partialAnnotation, annotation); reducedCategories.put(categoryName, annotation); } else { // add records to the existing annotation _rowAnnotationFactory.transferAnnotations(partialAnnotation, reducedAnnotation); } } } return new CategorizationResult(annotationFactory, reducedCategories); }
final RowAnnotation nullAnnotation = _rowAnnotationFactory.createAnnotation(); final RowAnnotation unexpectedValuesAnnotation = _rowAnnotationFactory.createAnnotation(); final Map<String, RowAnnotation> valueAnnotations = new HashMap<>(); RowAnnotation masterAnnotation = valueAnnotations.get(expectedValue); if (masterAnnotation == null) { masterAnnotation = _rowAnnotationFactory.createAnnotation(); valueAnnotations.put(expectedValue, masterAnnotation);
/**
 * Registers one observation of the given value. Null values go to the
 * dedicated null value annotation; any other value goes to the annotation
 * stored for it in {@code _annotationMap}, which is created on first use
 * inside a {@code synchronized (this)} block. When annotations are recorded
 * the row is annotated through the factory, otherwise only the row count of
 * the {@code RowAnnotationImpl} is incremented. The total count is increased
 * in every case.
 *
 * @param row
 *            the row in which the value was observed
 * @param value
 *            the observed value, may be {@code null}
 * @param distinctCount
 *            the amount by which the counts are increased
 */
public void run(final InputRow row, final String value, final int distinctCount) {
    if (value != null) {
        final RowAnnotation valueAnnotation;
        synchronized (this) {
            RowAnnotation stored = _annotationMap.get(value);
            if (stored == null) {
                stored = _recordAnnotations ? _annotationFactory.createAnnotation() : new RowAnnotationImpl();
                _annotationMap.put(value, stored);
            }
            valueAnnotation = stored;
        }

        if (!_recordAnnotations) {
            ((RowAnnotationImpl) valueAnnotation).incrementRowCount(distinctCount);
        } else {
            _annotationFactory.annotate(row, distinctCount, valueAnnotation);
        }
    } else if (!_recordAnnotations) {
        ((RowAnnotationImpl) _nullValueAnnotation).incrementRowCount(distinctCount);
    } else {
        _annotationFactory.annotate(row, distinctCount, _nullValueAnnotation);
    }

    _totalCount.addAndGet(distinctCount);
}
// store a newly created annotation under the corrected country name
// NOTE(review): if countryCountMap is a java.util.Map this replaces any
// annotation already stored for that name - confirm against the caller
countryCountMap.put(correctedCountryName, _rowAnnotationFactory.createAnnotation());
@Override public CompletenessAnalyzerResult reduce(final Collection<? extends CompletenessAnalyzerResult> results) { final CompletenessAnalyzerResult firstResult = results.iterator().next(); final RowAnnotation annotation = _rowAnnotationFactory.createAnnotation(); final InputColumn<?>[] highlightedColumns = firstResult.getHighlightedColumns(); int totalRowCount = 0; for (final CompletenessAnalyzerResult result : results) { final List<InputRow> sampleRows = result.getSampleRows(); final int invalidRowCount = result.getInvalidRowCount(); if (invalidRowCount == sampleRows.size()) { // if the rows are included for preview/sampling - then // re-annotate them in the master result for (final InputRow sampleRow : sampleRows) { _rowAnnotationFactory.annotate(sampleRow, annotation); } } else { // else we just transfer annotation counts _rowAnnotationFactory.transferAnnotations(result.getAnnotation(), annotation); } totalRowCount += result.getTotalRowCount(); } return new CompletenessAnalyzerResult(totalRowCount, annotation, _rowAnnotationFactory, highlightedColumns); }
return _rowAnntationFactoryRef.get(); } else if (baseType == RowAnnotation.class) { return _rowAnntationFactoryRef.get().createAnnotation(); } else if (baseType == DataCleanerConfiguration.class) { return getConfiguration();