DataFrames en Scala

Crear DataFrames

// Row is used to build the untyped records below and must be imported explicitly.
import org.apache.spark.sql.Row
import org.apache.spark.sql.types.{StructType, StructField, StringType, IntegerType}

// Sample records: (first name, surname, age, salary), in the same order as the schema fields.
val data = List(
  Row("Paco","Garcia",24,24000),
  Row("Juan","Garcia",26,27000),
  Row("Lola","Martin",29,31000),
  Row("Sara","Garcia",35,34000)
)

// Distribute the local list as an RDD[Row] so it can be paired with a schema.
val rdd = sc.parallelize(data)

// Explicit schema: the two name columns are declared non-nullable; edad and salario
// keep the default nullability (printSchema reports them as nullable = true).
val schema = StructType(
  List(
    StructField("nombre", StringType, nullable=false),
    StructField("apellido", StringType, nullable=false),
    StructField("edad", IntegerType),
    StructField("salario", IntegerType)
  )
)

// Combine the rows and the schema into a DataFrame.
val df = spark.createDataFrame(rdd,schema)

// Print the column names, types and nullability.
df.printSchema()

// Print the rows as a table.
df.show()
root
 |-- nombre: string (nullable = false)
 |-- apellido: string (nullable = false)
 |-- edad: integer (nullable = true)
 |-- salario: integer (nullable = true)
 +------+--------+----+-------+
 |nombre|apellido|edad|salario|
 +------+--------+----+-------+
 | Paco | Garcia | 24 | 24000 |
 | Juan | Garcia | 26 | 27000 |
 | Lola | Martin | 29 | 31000 |
 | Sara | Garcia | 35 | 34000 |
 +------+--------+----+-------+

 

Transformar RDD a DataFrame

Transformar RDD a DataFrame especificando nombres de columnas

// Column names, applied positionally to the tuple fields below.
val nombre_cols = Array("id", "nombre", "valores")

// Local sample data: one (id, name, values) tuple per row.
val filas = Seq(
  (1, "Mario", Seq(0, 2, 5)),
  (2, "Sonia", Seq(1, 20, 5))
)

// Distribute the tuples as an RDD and convert it to a DataFrame, expanding the
// name array as varargs. NOTE(review): toDF on an RDD presumably relies on
// spark.implicits._ being in scope (as in spark-shell) — confirm for other contexts.
val df = sc.parallelize(filas).toDF(nombre_cols: _*)

// Print the rows as a table.
df.show()
 +---+------+----------+
 | id|nombre| valores  |
 +---+------+----------+
 | 1| Mario | [0, 2, 5]|
 | 2| Sonia |[1, 20, 5]|
 +---+------+----------+

 

Transformar Dataset a DataFrame

import org.apache.spark.sql.functions._

// Build a Dataset of sentences from a local sequence distributed as an RDD.
val wordsDataset = sc.parallelize(
                     Seq("Hola mundo hola mundo", 
                         "ni hola ni mundo ni nada", 
                         "cuenta palabras"))
                    .toDS()

// Word count: tokenise, normalise case, then aggregate as a DataFrame.
// `.desc` is called with method syntax rather than as a postfix operator, which
// would require scala.language.postfixOps to be enabled.
val result = wordsDataset
              .flatMap(_.split(" "))               // Split each sentence into words
              .filter(_ != "")                     // Drop empty tokens
              .map(_.toLowerCase())                // Lower-case so "Hola" and "hola" count together
              .toDF()                              // Convert to a DataFrame to aggregate and sort
              .groupBy($"value")                   // One group per distinct word
              .agg(count("*").as("ocurrencias"))   // Count occurrences of each word
              .orderBy($"ocurrencias".desc)        // Most frequent words first
result.show()                                      // Display the occurrences of each word
 +---------+------------+
 | value   | ocurrencias|
 +---------+------------+
 | ni      |  3         |
 | hola    |  3         |
 | mundo   |  3         |
 | nada    |  1         |
 | palabras|  1         |
 | cuenta  |  1         |
 +---------+------------+

Autor: Diego Calvo