/**
 * Infers a schema from the CSV content of {@code in} and stores its string
 * form in {@code avroSchema}.
 *
 * The record name comes from the RECORD_NAME property, evaluated against
 * {@code inputFlowFile}; the boolean passed to {@code toString} comes from
 * the PRETTY_AVRO_OUTPUT property.
 *
 * @param in stream handed to {@code CSVUtil.inferSchema}
 * @throws IOException declared by this callback method
 */
@Override
public void process(InputStream in) throws IOException {
  // Resolve the record name first, exactly as the argument list would.
  final String recordName = context.getProperty(RECORD_NAME)
      .evaluateAttributeExpressions(inputFlowFile)
      .getValue();

  avroSchema.set(
      CSVUtil.inferSchema(recordName, in, props)
          .toString(context.getProperty(PRETTY_AVRO_OUTPUT).asBoolean()));
}
});
boolean makeNullable) throws IOException { CSVReader reader = newReader(incoming, props); header = newParser(props).parseLine(props.header); line = reader.readNext(); Preconditions.checkNotNull(line, "No content to infer schema"); if (i < line.length) { if (types[i] == null) { types[i] = inferFieldType(line[i]); if (types[i] != null) { .doc("Type inferred from '" + sample(values[i]) + "'") .type(schema(types[i], false)).noDefault(); } else { SchemaBuilder.GenericDefault<Schema> defaultBuilder = fieldAssembler.name(fieldName) .doc("Type inferred from '" + sample(values[i]) + "'") .type(schema(types[i], makeNullable || foundNull)); if (makeNullable || foundNull) { fieldAssembler = defaultBuilder.withDefault(null);
/**
 * Verifies that {@code CSVUtil.sample} returns printable ASCII input
 * unchanged: upper-case letters, lower-case letters, digits, and punctuation.
 */
@Test
public void testSamplePrintableCharactersNotChanged() {
  // Full A-Z alphabet so that every upper-case letter, including 'S', is covered.
  String upper = "ABCDEFGHIJKLMNOPQRSTUVWXYZ";
  Assert.assertEquals("Upper case letters shouldn't be removed",
      upper, CSVUtil.sample(upper));

  String lower = "abcdefghijklmnopqrstuvwxyz";
  Assert.assertEquals("Lower case letters shouldn't be removed",
      lower, CSVUtil.sample(lower));

  String numbers = "0123456789";
  Assert.assertEquals("Numbers shouldn't be removed",
      numbers, CSVUtil.sample(numbers));

  String punctuation = " _-~+!@#$%^&*(){}[]<>,.?:;`'\"/\\|";
  Assert.assertEquals("Punctuation shouldn't be removed",
      punctuation, CSVUtil.sample(punctuation));
}
.inferNullableSchema( recordName, open(samplePaths.get(0)), props, required) .toString(!minimize);
public CSVRecordParser(CSVProperties props, Schema schema, Class<E> type, @Nullable List<String> header) { this.parser = CSVUtil.newParser(props); this.builder = new CSVRecordBuilder<E>( DataModelUtil.getReaderSchema(type, schema), type, getHeader(props, header)); }
/**
 * Infers a schema from CSV content, delegating to
 * {@code inferSchemaInternal} with its final boolean argument set to
 * {@code false}.
 *
 * @param name           name passed through to the internal inference
 * @param incoming       stream of CSV content
 * @param props          CSV properties
 * @param requiredFields field names passed through as the required set
 * @return the schema produced by {@code inferSchemaInternal}
 * @throws IOException propagated from {@code inferSchemaInternal}
 */
public static Schema inferSchema(String name, InputStream incoming,
                                 CSVProperties props,
                                 Set<String> requiredFields)
    throws IOException {
  // NOTE(review): presumably this is the "make fields nullable" switch of
  // inferSchemaInternal -- confirm against its declaration.
  final boolean makeNullable = false;
  return inferSchemaInternal(
      name, incoming, props, requiredFields, makeNullable);
}
/**
 * Opens the output: creates the file at {@code path} on {@code fs},
 * overwriting any existing file, and wraps the resulting stream in a CSV
 * writer configured from {@code props}.
 *
 * @throws IOException declared by this method; raised by the calls it makes
 */
@Override
public void open() throws IOException {
  final boolean overwrite = true;
  outgoing = fs.create(path, overwrite);
  writer = CSVUtil.newWriter(outgoing, props);
}
/**
 * Verifies that {@code CSVUtil.sample(null)} yields the same text as
 * {@code String.valueOf} applied to a null String reference.
 */
@Test
public void testSampleNull() {
  // Typed as String so the same valueOf overload is selected.
  final String nullString = null;

  final String expected = String.valueOf(nullString);
  final String actual = CSVUtil.sample(nullString);

  Assert.assertEquals("Should handle null like String.valueOf",
      expected, actual);
}
}
/**
 * Verifies that a dotted record name is split into a namespace and a simple
 * name on the inferred schema.
 */
@Test
public void testSchemaNamespace() throws Exception {
  InputStream csv = new ByteArrayInputStream(csvLines.getBytes("utf8"));
  CSVProperties withHeader = new CSVProperties.Builder().hasHeader().build();

  Schema inferred = CSVUtil.inferNullableSchema(
      "com.example.TestRecord", csv, withHeader);

  Assert.assertEquals("Should use name",
      "TestRecord", inferred.getName());
  Assert.assertEquals("Should set namespace",
      "com.example", inferred.getNamespace());
}
/**
 * Resolves the header to use for CSV records.
 *
 * An explicit {@code header} wins. Otherwise, if {@code props.header} is
 * set, it is parsed with a parser created from {@code props}.
 *
 * @param props  CSV properties; {@code props.header} is consulted only when
 *               {@code header} is null
 * @param header explicit header, or null
 * @return {@code header} if non-null, else the parsed {@code props.header},
 *         else null when neither is available
 * @throws DatasetIOException if parsing {@code props.header} fails with an
 *         IOException
 */
public static List<String> getHeader(CSVProperties props,
                                     @Nullable List<String> header) {
  if (header != null) {
    return header;
  }

  if (props.header == null) {
    // Neither an explicit header nor one in the properties.
    return null;
  }

  try {
    return Lists.newArrayList(
        CSVUtil.newParser(props).parseLine(props.header));
  } catch (IOException e) {
    throw new DatasetIOException(
        "Failed to parse header from properties: " + props.header, e);
  }
}
}
/**
 * Infers a schema from CSV content, delegating to
 * {@code inferSchemaInternal} with its final boolean argument set to
 * {@code true}.
 *
 * @param name           name passed through to the internal inference
 * @param incoming       stream of CSV content
 * @param props          CSV properties
 * @param requiredFields field names passed through as the required set
 * @return the schema produced by {@code inferSchemaInternal}
 * @throws IOException propagated from {@code inferSchemaInternal}
 */
public static Schema inferNullableSchema(String name, InputStream incoming,
                                         CSVProperties props,
                                         Set<String> requiredFields)
    throws IOException {
  // NOTE(review): presumably this is the "make fields nullable" switch of
  // inferSchemaInternal -- confirm against its declaration.
  final boolean makeNullable = true;
  return inferSchemaInternal(
      name, incoming, props, requiredFields, makeNullable);
}
/**
 * Infers a schema from the CSV content of {@code in} and stores its string
 * form in {@code avroSchema}.
 *
 * The record name comes from the RECORD_NAME property, evaluated against
 * {@code inputFlowFile}; the boolean passed to {@code toString} comes from
 * the PRETTY_AVRO_OUTPUT property.
 *
 * @param in stream handed to {@code CSVUtil.inferSchema}
 * @throws IOException declared by this callback method
 */
@Override
public void process(InputStream in) throws IOException {
  // Resolve the record name first, exactly as the argument list would.
  final String recordName = context.getProperty(RECORD_NAME)
      .evaluateAttributeExpressions(inputFlowFile)
      .getValue();

  avroSchema.set(
      CSVUtil.inferSchema(recordName, in, props)
          .toString(context.getProperty(PRETTY_AVRO_OUTPUT).asBoolean()));
}
});
/**
 * Verifies that {@code CSVUtil.sample} replaces the snowflake character
 * (U+2744) with a '.' in its output.
 */
@Test
public void testUnicodeRemoved() {
  final String hasUnicode = "Unicode snowflake: \u2744";
  final String expected = "Unicode snowflake: .";

  final String actual = CSVUtil.sample(hasUnicode);

  Assert.assertEquals("Should remove unicode", expected, actual);
}
@Test public void testNullableSchemaInference() throws Exception { InputStream stream = new ByteArrayInputStream(csvLines.getBytes("utf8")); Schema schema = CSVUtil.inferNullableSchema("TestRecord", stream, new CSVProperties.Builder().hasHeader().build(), ImmutableSet.of("float"));
/**
 * Runs schema inference over {@code csvLines} with {@code hasHeader()} set,
 * passing "nullable_string" as the only required field. Any IOException is
 * rethrown wrapped in a RuntimeException.
 */
@Override
public void run() {
  try {
    // Everything stays inside the try: getBytes("utf8") can itself raise
    // an IOException subtype.
    InputStream csv = new ByteArrayInputStream(csvLines.getBytes("utf8"));
    CSVProperties withHeader =
        new CSVProperties.Builder().hasHeader().build();

    CSVUtil.inferSchema("TestRecord", csv, withHeader,
        ImmutableSet.of("nullable_string"));
  } catch (IOException e) {
    throw new RuntimeException("Schema inference threw IOException", e);
  }
}
});
/**
 * Verifies that {@code CSVUtil.sample} shortens a long value: an 80-character
 * URL is expected to come back as its first 50 characters.
 */
@Test
public void testSampleTruncated() {
  final String longUrl = "https://github.com/kite-sdk/kite/commit/" +
      "bbe3e917875e879ca58b8afe90efa96cdd4691d1";
  final String expected =
      "https://github.com/kite-sdk/kite/commit/bbe3e91787";

  final String actual = CSVUtil.sample(longUrl);

  Assert.assertEquals("Should truncate long values", expected, actual);
}
@Test public void testNullableSchemaInferenceWithoutHeader() throws Exception { InputStream stream = new ByteArrayInputStream(csvLines.getBytes("utf8")); Schema schema = CSVUtil.inferNullableSchema("TestRecord", stream, new CSVProperties.Builder().build(), ImmutableSet.of("long", "field_1"));
/**
 * Runs schema inference over {@code csvLines} with {@code hasHeader()} set,
 * passing "nullable_long" as the only required field. Any IOException is
 * rethrown wrapped in a RuntimeException.
 */
@Override
public void run() {
  try {
    // Everything stays inside the try: getBytes("utf8") can itself raise
    // an IOException subtype.
    InputStream csv = new ByteArrayInputStream(csvLines.getBytes("utf8"));
    CSVProperties withHeader =
        new CSVProperties.Builder().hasHeader().build();

    CSVUtil.inferSchema("TestRecord", csv, withHeader,
        ImmutableSet.of("nullable_long"));
  } catch (IOException e) {
    throw new RuntimeException("Schema inference threw IOException", e);
  }
}
});
/**
 * Verifies inference when a column has no example value.
 *
 * The input is a blank first line, the header "one,two", and one data row
 * "34,"; the properties are built with {@code linesToSkip(1)} and
 * {@code hasHeader()}. Column "one" is expected to be a long, and column
 * "two" a nullable string.
 */
@Test
public void testSchemaInferenceMissingExample() throws Exception {
  InputStream csv = new ByteArrayInputStream(
      "\none,two\n34,\n".getBytes("utf8"));
  CSVProperties skipOneWithHeader =
      new CSVProperties.Builder().linesToSkip(1).hasHeader().build();

  Schema inferred =
      CSVUtil.inferSchema("TestRecord", csv, skipOneWithHeader);

  // Both columns must exist before their types are compared.
  Assert.assertNotNull(inferred.getField("one"));
  Assert.assertNotNull(inferred.getField("two"));

  Assert.assertEquals("Should infer a long",
      schema(Schema.Type.LONG), inferred.getField("one").schema());
  Assert.assertEquals("Should default to a string",
      nullable(Schema.Type.STRING), inferred.getField("two").schema());
}