/**
 * Drops every HoodieRecord that already exists in the output folder, keeping only the records
 * that are new. Handy for de-duplication.
 *
 * @param hoodieRecords Input RDD of Hoodie records.
 * @return The subset of {@code hoodieRecords} whose current location is not known after tagging.
 */
public JavaRDD<HoodieRecord<T>> filterExists(JavaRDD<HoodieRecord<T>> hoodieRecords) {
  // Tag each record with its location, then keep only the ones that were left untagged.
  return tagLocation(hoodieRecords).filter(rec -> !rec.isCurrentLocationKnown());
}
/**
 * Drops every HoodieRecord that already exists in the output folder, keeping only the records
 * that are new. Handy for de-duplication.
 *
 * @param hoodieRecords Input RDD of Hoodie records.
 * @return The subset of {@code hoodieRecords} whose current location is not known after tagging.
 */
public JavaRDD<HoodieRecord<T>> filterExists(JavaRDD<HoodieRecord<T>> hoodieRecords) {
  // Tag each record with its location, then keep only the ones that were left untagged.
  return tagLocation(hoodieRecords).filter(rec -> !rec.isCurrentLocationKnown());
}
@SuppressWarnings("unchecked") public static JavaRDD<HoodieRecord> dropDuplicates(JavaSparkContext jssc, JavaRDD<HoodieRecord> incomingHoodieRecords, HoodieWriteConfig writeConfig) throws Exception { try { HoodieReadClient client = new HoodieReadClient<>(jssc, writeConfig); return client.tagLocation(incomingHoodieRecords) .filter(r -> !((HoodieRecord<HoodieRecordPayload>) r).isCurrentLocationKnown()); } catch (DatasetNotFoundException e) { // this will be executed when there is no hoodie dataset yet // so no dups to drop return incomingHoodieRecords; } }
/** * Filter out HoodieRecords that already exists in the output folder. This is useful in * deduplication. * * @param hoodieRecords Input RDD of Hoodie records. * @return A subset of hoodieRecords RDD, with existing records filtered out. */ public JavaRDD<HoodieRecord<T>> filterExists(JavaRDD<HoodieRecord<T>> hoodieRecords) { // Create a Hoodie table which encapsulated the commits and files visible HoodieTable<T> table = HoodieTable.getHoodieTable( new HoodieTableMetaClient(jsc.hadoopConfiguration(), config.getBasePath(), true), config, jsc); JavaRDD<HoodieRecord<T>> recordsWithLocation = index.tagLocation(hoodieRecords, jsc, table); return recordsWithLocation.filter(v1 -> !v1.isCurrentLocationKnown()); }
/** * Filter out HoodieRecords that already exists in the output folder. This is useful in * deduplication. * * @param hoodieRecords Input RDD of Hoodie records. * @return A subset of hoodieRecords RDD, with existing records filtered out. */ public JavaRDD<HoodieRecord<T>> filterExists(JavaRDD<HoodieRecord<T>> hoodieRecords) { // Create a Hoodie table which encapsulated the commits and files visible HoodieTable<T> table = HoodieTable.getHoodieTable( new HoodieTableMetaClient(jsc.hadoopConfiguration(), config.getBasePath(), true), config, jsc); JavaRDD<HoodieRecord<T>> recordsWithLocation = index.tagLocation(hoodieRecords, jsc, table); return recordsWithLocation.filter(v1 -> !v1.isCurrentLocationKnown()); }
/**
 * Ensure every record has its location field set and that the location carries the given
 * commit time.
 *
 * @param taggedRecords Tagged Records
 * @param commitTime Commit Timestamp each record's current location is expected to carry
 */
void checkTaggedRecords(List<HoodieRecord> taggedRecords, String commitTime) {
  for (HoodieRecord rec : taggedRecords) {
    assertTrue("Record " + rec + " found with no location.", rec.isCurrentLocationKnown());
    // assertEquals takes (message, expected, actual): the commit time handed in is the
    // expectation and the record's own commit time is the value under test, so a failure
    // report names the right side as "expected".
    assertEquals("All records should have commit time " + commitTime + ", since updates were made",
        commitTime, rec.getCurrentLocation().getCommitTime());
  }
}
assertTrue(record.getCurrentLocation().getFileId().equals(FSUtils.getFileId(filename2))); } else if (record.getRecordKey().equals("002")) { assertTrue(!record.isCurrentLocationKnown()); } else if (record.getRecordKey().equals("004")) { assertTrue(record.getCurrentLocation().getFileId().equals(FSUtils.getFileId(filename3)));
assert (javaRDD.filter(record -> record.isCurrentLocationKnown()).collect().size() == 0); assert (javaRDD.filter(record -> record.isCurrentLocationKnown()).collect().size() == 0); hoodieTable = HoodieTable.getHoodieTable(metaClient, config, jsc); javaRDD = index.tagLocation(writeRecords, jsc, hoodieTable); assertTrue(javaRDD.filter(record -> record.isCurrentLocationKnown()).collect().size() == 200); assertTrue(javaRDD.map(record -> record.getKey().getRecordKey()).distinct().count() == 200); assertTrue(javaRDD.filter(
assert (javaRDD.filter(record -> record.isCurrentLocationKnown()).collect().size() == 200); assert (javaRDD.filter(record -> record.isCurrentLocationKnown()).collect().size() == 0); assert (javaRDD.filter(record -> record.getCurrentLocation() != null).collect().size() == 0);
assertTrue(!record.isCurrentLocationKnown()); assertTrue(record.getCurrentLocation().getFileId().equals(FSUtils.getFileId(filename2))); } else if (record.getRecordKey().equals("3eb5b87c-1fej-4edd-87b4-6ec96dc405a0")) { assertTrue(!record.isCurrentLocationKnown()); } else if (record.getRecordKey().equals("4eb5b87c-1fej-4edd-87b4-6ec96dc405a0")) { assertTrue(record.getCurrentLocation().getFileId().equals(FSUtils.getFileId(filename3)));
assertTrue(record.getCurrentLocation().getFileId().equals(FSUtils.getFileId(filename))); } else if (record.getRecordKey().equals("2eb5b87b-1feu-4edd-87b4-6ec96dc405a0")) { assertFalse(record.isCurrentLocationKnown());