/** Returns true when the schema is null, reports no field-name array, or has zero fields. */
public static boolean isEmptySchema(StructType schema) {
    if (schema == null) {
        return true;
    }
    String[] names = schema.fieldNames();
    return names == null || names.length == 0;
}
/**
 * Verifies crosstab("a", "b") over testData2: header column plus one column per
 * distinct "b" value, with each (a, b) pair occurring exactly once.
 */
@Test
public void testCrosstab() {
    Dataset<Row> df = spark.table("testData2");
    Dataset<Row> crosstab = df.stat().crosstab("a", "b");
    String[] columnNames = crosstab.schema().fieldNames();
    Assert.assertEquals("a_b", columnNames[0]);
    Assert.assertEquals("1", columnNames[1]);
    Assert.assertEquals("2", columnNames[2]);
    List<Row> rows = crosstab.collectAsList();
    rows.sort(crosstabRowComparator);
    int count = 1;
    for (Row row : rows) {
        // JUnit convention: expected value first, actual second (was reversed).
        Assert.assertEquals(String.valueOf(count), row.get(0).toString());
        Assert.assertEquals(1L, row.getLong(1));
        Assert.assertEquals(1L, row.getLong(2));
        count++;
    }
}
/**
 * Verifies crosstab("a", "b") over testData2: header column plus one column per
 * distinct "b" value, with each (a, b) pair occurring exactly once.
 */
@Test
public void testCrosstab() {
    Dataset<Row> df = spark.table("testData2");
    Dataset<Row> crosstab = df.stat().crosstab("a", "b");
    String[] columnNames = crosstab.schema().fieldNames();
    Assert.assertEquals("a_b", columnNames[0]);
    Assert.assertEquals("1", columnNames[1]);
    Assert.assertEquals("2", columnNames[2]);
    List<Row> rows = crosstab.collectAsList();
    rows.sort(crosstabRowComparator);
    int count = 1;
    for (Row row : rows) {
        // JUnit convention: expected value first, actual second (was reversed).
        Assert.assertEquals(String.valueOf(count), row.get(0).toString());
        Assert.assertEquals(1L, row.getLong(1));
        Assert.assertEquals(1L, row.getLong(2));
        count++;
    }
}
/**
 * Verifies crosstab("a", "b") over testData2: header column plus one column per
 * distinct "b" value, with each (a, b) pair occurring exactly once.
 */
@Test
public void testCrosstab() {
    Dataset<Row> df = spark.table("testData2");
    Dataset<Row> crosstab = df.stat().crosstab("a", "b");
    String[] columnNames = crosstab.schema().fieldNames();
    Assert.assertEquals("a_b", columnNames[0]);
    Assert.assertEquals("1", columnNames[1]);
    Assert.assertEquals("2", columnNames[2]);
    List<Row> rows = crosstab.collectAsList();
    rows.sort(crosstabRowComparator);
    int count = 1;
    for (Row row : rows) {
        // JUnit convention: expected value first, actual second (was reversed).
        Assert.assertEquals(String.valueOf(count), row.get(0).toString());
        Assert.assertEquals(1L, row.getLong(1));
        Assert.assertEquals(1L, row.getLong(2));
        count++;
    }
}
/**
 * Collects the distinct column families referenced by the row's fields,
 * skipping the synthetic "rowkey" family.
 */
private Set<String> getColumnFamilies(Row row) {
    Set<String> families = Sets.newHashSet();
    for (String field : row.schema().fieldNames()) {
        ColumnDef columnDef = columns.get(field);
        if (columnDef.cf.equals("rowkey")) {
            continue;
        }
        families.add(columnDef.cf);
    }
    return families;
}
/**
 * Returns true when the Spark schema's column names match the analytics
 * schema's column names exactly, including order.
 */
public static Boolean validateSchemaColumns(StructType sparkSchema, AnalyticsSchema analyticsSchema) {
    String[] sparkCols = sparkSchema.fieldNames();
    String[] analyticsCols = analyticsSchema.getColumns().keySet().toArray(new String[0]);
    return Arrays.equals(sparkCols, analyticsCols);
}
/**
 * True when the row contains every configured key column.
 * Builds the lookup set once instead of re-wrapping and linearly scanning
 * the field-name array for each key column (was O(n*m)).
 */
private boolean filtersEntireRowKey(Row row) {
    Set<String> rowFieldNames = Sets.newHashSet(row.schema().fieldNames());
    return rowFieldNames.containsAll(keyColumns);
}
/**
 * True if the provided row contains a mutation type field.
 */
public static boolean hasMutationTypeField(Row row) {
    return Arrays.stream(row.schema().fieldNames())
        .anyMatch(fieldName -> fieldName.equals(MutationType.MUTATION_TYPE_FIELD_NAME));
}
/**
 * True when the row's columns are exactly a leading prefix of the key columns.
 * Guards against rows with more columns than the key: subList() would throw
 * IndexOutOfBoundsException in that case, so such rows simply don't match.
 */
private boolean filtersRowKeyPrefix(Row row) {
    Set<String> rowColumnNames = Sets.newHashSet(row.schema().fieldNames());
    if (rowColumnNames.size() > keyColumns.size()) {
        return false;
    }
    Set<String> prefixColumnNames = Sets.newHashSet(keyColumns.subList(0, rowColumnNames.size()));
    return rowColumnNames.equals(prefixColumnNames);
}
/**
 * Pairs each positional row value with its schema-declared column name.
 * Assumes the schema has at least as many fields as the row — TODO confirm.
 */
public static Map<String, Object> convertRowAndSchemaToValuesMap(Row row, StructType schema) {
    String[] columnNames = schema.fieldNames();
    Map<String, Object> valuesByColumn = new HashMap<>();
    for (int index = 0; index < row.length(); index++) {
        valuesByColumn.put(columnNames[index], row.get(index));
    }
    return valuesByColumn;
}
/**
 * Streams the row's field names, excluding reserved property names and
 * fields whose value is null. The reserved-name check runs first, so
 * fieldIndex() is only consulted for non-reserved fields.
 */
private Stream<String> getPropertyNames(final Row row) {
    return Arrays.stream(row.schema().fieldNames())
        .filter(name -> !ReservedPropertyNames.contains(name)
            && !row.isNullAt(row.fieldIndex(name)));
}
}
/**
 * Materializes the Dataset into a DataSetResult holding the column names
 * and one value list per row.
 */
public static DataSetResult getDataSetResult(Dataset<Row> df) {
    DataSetResult result = new DataSetResult();
    String[] fieldNames = df.schema().fieldNames();
    result.getColumnNames().addAll(Arrays.asList(fieldNames));
    // collectAsList() is the Java-API idiom and avoids the unchecked (Row[]) cast.
    for (Row row : df.collectAsList()) {
        List<Object> values = new ArrayList<>(fieldNames.length);
        for (int i = 0; i < fieldNames.length; i++) {
            values.add(row.get(i));
        }
        result.getRows().add(values);
    }
    return result;
}
/**
 * Reorders and pads the input's columns to match the target table's schema:
 * table columns present in the input are selected as-is, missing ones are
 * filled with null literals. Name comparison honors Spark's case-sensitivity
 * setting (default-insensitive, via lowercasing both sides).
 */
public Dataset<Row> alignColumns(Dataset<Row> input) {
    boolean caseSensitive = Contexts.getSparkSession().sparkContext().getConf()
        .getBoolean(SPARK_SQL_CASE_SENSITIVE_CONFIG, false);

    Set<String> inputCols = new HashSet<>();
    for (String col : input.schema().fieldNames()) {
        inputCols.add(caseSensitive ? col : col.toLowerCase());
    }

    List<Column> alignedCols = new ArrayList<>();
    for (String col : Contexts.getSparkSession().table(tableName).schema().fieldNames()) {
        String name = caseSensitive ? col : col.toLowerCase();
        alignedCols.add(inputCols.contains(name)
            ? functions.col(name)
            : functions.lit(null).alias(name));
    }
    return input.select(alignedCols.toArray(new Column[0]));
}
/**
 * True when the row equals the filter on every filter field. The comparison
 * is null-safe: a null row value matches only a null filter value, instead
 * of throwing NullPointerException as the previous direct equals() call did.
 */
private boolean matchesValueFilter(Row row, Row filter) {
    for (String filterFieldName : filter.schema().fieldNames()) {
        Object rowValue = row.get(row.fieldIndex(filterFieldName));
        Object filterValue = RowUtils.get(filter, filterFieldName);
        boolean equal = (rowValue == null) ? (filterValue == null) : rowValue.equals(filterValue);
        if (!equal) {
            return false;
        }
    }
    return true;
}
/**
 * Projects the row down to the fields named by subsetSchema, looking each
 * field up by name in the source row, and returns a new schema-bearing row.
 */
public static Row subsetRow(Row row, StructType subsetSchema) {
    String[] fieldNames = subsetSchema.fieldNames();
    Object[] values = new Object[subsetSchema.length()];
    for (int i = 0; i < fieldNames.length; i++) {
        values[i] = row.get(row.fieldIndex(fieldNames[i]));
    }
    return new RowWithSchema(subsetSchema, values);
}
public static RowBasedKeyValueBatch allocate(StructType keySchema, StructType valueSchema, TaskMemoryManager manager, int maxRows) { boolean allFixedLength = true; // checking if there is any variable length fields // there is probably a more succinct impl of this for (String name : keySchema.fieldNames()) { allFixedLength = allFixedLength && UnsafeRow.isFixedLength(keySchema.apply(name).dataType()); } for (String name : valueSchema.fieldNames()) { allFixedLength = allFixedLength && UnsafeRow.isFixedLength(valueSchema.apply(name).dataType()); } if (allFixedLength) { return new FixedLengthRowBasedKeyValueBatch(keySchema, valueSchema, maxRows, manager); } else { return new VariableLengthRowBasedKeyValueBatch(keySchema, valueSchema, maxRows, manager); } }
/**
 * Converts the DataFrame to an AnalyticsQueryResult, capping the number of
 * collected rows when "carbon.spark.results.limit" is configured (-1 means
 * unlimited).
 */
private AnalyticsQueryResult toResult(DataFrame dataFrame) throws AnalyticsExecutionException {
    int resultsLimit = this.sparkConf.getInt("carbon.spark.results.limit", -1);
    DataFrame capped = (resultsLimit != -1) ? dataFrame.limit(resultsLimit) : dataFrame;
    return new AnalyticsQueryResult(dataFrame.schema().fieldNames(),
        convertRowsToObjects(capped.collect()));
}
public static RowBasedKeyValueBatch allocate(StructType keySchema, StructType valueSchema, TaskMemoryManager manager, int maxRows) { boolean allFixedLength = true; // checking if there is any variable length fields // there is probably a more succinct impl of this for (String name : keySchema.fieldNames()) { allFixedLength = allFixedLength && UnsafeRow.isFixedLength(keySchema.apply(name).dataType()); } for (String name : valueSchema.fieldNames()) { allFixedLength = allFixedLength && UnsafeRow.isFixedLength(valueSchema.apply(name).dataType()); } if (allFixedLength) { return new FixedLengthRowBasedKeyValueBatch(keySchema, valueSchema, maxRows, manager); } else { return new VariableLengthRowBasedKeyValueBatch(keySchema, valueSchema, maxRows, manager); } }
/**
 * Returns a copy of the target row whose time fields (as defined by
 * intoTimeModel) are overwritten with the time taken from fromRow (as
 * defined by fromTimeModel). The two time models must be of the same type,
 * though they may be different instances. Neither input row is modified.
 */
public static Row copyTime(Row fromRow, TimeModel fromTimeModel, Row into, TimeModel intoTimeModel) {
    assertCompatibleTimeModels(fromTimeModel, intoTimeModel);
    Row sourceTime = fromTimeModel.getTime(fromRow);
    String[] targetFieldNames = intoTimeModel.getSchema().fieldNames();
    for (int i = 0; i < fromTimeModel.getSchema().size(); i++) {
        into = RowUtils.set(into, targetFieldNames[i], sourceTime.get(i));
    }
    return into;
}
/**
 * Returns a copy of the target row whose time fields (as defined by
 * intoTimeModel) are overwritten with the preceding time taken from fromRow
 * (as defined by fromTimeModel). The two time models must be of the same
 * type, though they may be different instances. Neither input row is
 * modified.
 */
public static Row copyPrecedingTime(Row fromRow, TimeModel fromTimeModel, Row into, TimeModel intoTimeModel) {
    assertCompatibleTimeModels(fromTimeModel, intoTimeModel);
    Row sourceTime = fromTimeModel.getPrecedingTime(fromRow);
    String[] targetFieldNames = intoTimeModel.getSchema().fieldNames();
    for (int i = 0; i < fromTimeModel.getSchema().size(); i++) {
        into = RowUtils.set(into, targetFieldNames[i], sourceTime.get(i));
    }
    return into;
}