// Given a DataFrame, its SQLContext, and the [start, end) row-index range to keep:
DataFrame df;
SQLContext sqlContext;
Long start;
Long end;

// Attach a sequential index to each row, keep only the rows whose index falls in
// [start, end), then drop the index and rebuild a DataFrame with the original schema.
JavaPairRDD<Row, Long> indexedRDD = df.toJavaRDD().zipWithIndex();
JavaRDD<Row> filteredRDD = indexedRDD
        .filter((Tuple2<Row, Long> v1) -> v1._2 >= start && v1._2 < end)
        .keys();
DataFrame filteredDataFrame = sqlContext.createDataFrame(filteredRDD, df.schema());
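A hedged usage sketch of the same pattern wrapped in a hypothetical helper; the names and the 100/200 range are illustrative only. Note that zipWithIndex itself triggers a Spark job when the RDD has more than one partition, so slicing this way is not free.

import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.sql.DataFrame;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SQLContext;

// Hypothetical helper: keep the rows whose zipWithIndex position falls in [start, end).
static DataFrame sliceByRowIndex(SQLContext sqlContext, DataFrame df, long start, long end) {
    JavaRDD<Row> rows = df.toJavaRDD()
            .zipWithIndex()
            .filter(t -> t._2 >= start && t._2 < end)
            .keys();
    return sqlContext.createDataFrame(rows, df.schema());
}

// e.g. rows 100 (inclusive) to 200 (exclusive):
DataFrame secondPage = sliceByRowIndex(sqlContext, df, 100, 200);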
public static List<Record> dataFrameToRecordsList(int tenantId, String tableName, DataFrame dataFrame) {
    Row[] rows = dataFrame.collect();
    List<Record> records = new ArrayList<>();
    StructType schema = dataFrame.schema();
    for (Row row : rows) {
        records.add(new Record(tenantId, tableName, convertRowAndSchemaToValuesMap(row, schema)));
    }
    return records;
}
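The convertRowAndSchemaToValuesMap helper is referenced above but not shown; a minimal sketch of what such a converter might look like, assuming the Record constructor takes a Map of column name to value (both the signature and the body are assumptions for illustration, not the actual implementation):

import java.util.HashMap;
import java.util.Map;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.types.StructType;

// Hypothetical sketch: pair each schema field name with the value at the same position in the Row.
private static Map<String, Object> convertRowAndSchemaToValuesMap(Row row, StructType schema) {
    Map<String, Object> values = new HashMap<>();
    String[] fieldNames = schema.fieldNames();
    for (int i = 0; i < fieldNames.length; i++) {
        values.put(fieldNames[i], row.get(i));
    }
    return values;
}

Note that dataFrame.collect() above pulls every row to the driver, so this conversion only suits result sets that fit in driver memory.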
private AnalyticsQueryResult toResult(DataFrame dataFrame) throws AnalyticsExecutionException {
    int resultsLimit = this.sparkConf.getInt("carbon.spark.results.limit", -1);
    if (resultsLimit != -1) {
        return new AnalyticsQueryResult(dataFrame.schema().fieldNames(),
                convertRowsToObjects(dataFrame.limit(resultsLimit).collect()));
    } else {
        return new AnalyticsQueryResult(dataFrame.schema().fieldNames(),
                convertRowsToObjects(dataFrame.collect()));
    }
}
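The results limit read above is an ordinary SparkConf entry; a small, hedged example of setting it when the configuration is built (the value 1000 is arbitrary; the key is simply the one the snippet reads back with getInt(..., -1)):

import org.apache.spark.SparkConf;

SparkConf sparkConf = new SparkConf()
        .set("carbon.spark.results.limit", "1000"); // -1 (the default above) means no limit is applied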
@Override
public OneHotEncoderModelInfo getModelInfo(final OneHotEncoder from, DataFrame df) {
    OneHotEncoderModelInfo modelInfo = new OneHotEncoderModelInfo();
    String inputColumn = from.getInputCol();
    // Ugly, but the only way to deal with Spark here: the category count has to be read
    // from the ML attribute metadata attached to the input column's StructField.
    int numTypes = -1;
    Attribute attribute = Attribute.fromStructField(df.schema().apply(inputColumn));
    if (attribute.attrType() == AttributeType.Nominal()) {
        numTypes = ((NominalAttribute) attribute).values().get().length;
    } else if (attribute.attrType() == AttributeType.Binary()) {
        numTypes = ((BinaryAttribute) attribute).values().get().length;
    }
    // TODO: Since dropLast is not accessible here, we deliberately set numTypes ourselves.
    // This is the reason we should use CustomOneHotEncoder instead.
    modelInfo.setNumTypes(numTypes - 1);

    Set<String> inputKeys = new LinkedHashSet<String>();
    inputKeys.add(from.getInputCol());
    modelInfo.setInputKeys(inputKeys);

    Set<String> outputKeys = new LinkedHashSet<String>();
    outputKeys.add(from.getOutputCol());
    modelInfo.setOutputKeys(outputKeys);

    return modelInfo;
}
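For context, `from` here is a standard Spark ML OneHotEncoder; a hedged sketch of configuring one so that getInputCol() and getOutputCol() return the values consumed above (the column names are made up). On Spark versions that expose it, setDropLast is the setting the TODO refers to.

import org.apache.spark.ml.feature.OneHotEncoder;

// Illustrative column names only.
OneHotEncoder encoder = new OneHotEncoder()
        .setInputCol("categoryIndex")
        .setOutputCol("categoryVec");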
private void writeDataFrameToDAL(DataFrame data) {
    if (this.preserveOrder) {
        logDebug("Inserting data with order preserved! Each partition will be written using separate jobs.");
        // Run one Spark job per partition, in partition order, so the write order is preserved.
        for (int i = 0; i < data.rdd().partitions().length; i++) {
            data.sqlContext().sparkContext().runJob(data.rdd(),
                    new AnalyticsWritingFunction(this.tenantId, this.tableName, data.schema(),
                            this.globalTenantAccess, this.schemaString, this.primaryKeys, this.mergeFlag,
                            this.recordStore, this.recordBatchSize),
                    CarbonScalaUtils.getNumberSeq(i, i + 1), false, ClassTag$.MODULE$.Unit());
        }
    } else {
        // Order does not matter: write all partitions in parallel in a single job.
        data.foreachPartition(new AnalyticsWritingFunction(this.tenantId, this.tableName, data.schema(),
                this.globalTenantAccess, this.schemaString, this.primaryKeys, this.mergeFlag,
                this.recordStore, this.recordBatchSize));
    }
}
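A note on the design choice this snippet encodes: when preserveOrder is set, runJob is invoked once per partition with what is presumably a single-element partition-id sequence (CarbonScalaUtils.getNumberSeq(i, i + 1)), so partitions are persisted strictly one after another at the cost of a separate Spark job each; ClassTag$.MODULE$.Unit() just supplies the ClassTag that the Scala runJob signature requires when called from Java. When order does not matter, foreachPartition writes all partitions concurrently in a single job.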