Spark: clean way to fill nulls based on schema

In [1]: from pyspark.sql.types import StructField, StructType, StringType
   ...: from pyspark.sql.functions import col, from_json

In [2]: schema = StructType([
   ...:     StructField("a", StringType()),
   ...:     StructField("b", StringType()),
   ...:     StructField("c", StringType()),
   ...:     StructField("d", StringType()),
   ...: ])

In [3]: df = spark.createDataFrame([("1", '{"a": 1, "b": 2}'),
   ...:                             ("2", '{"a": 3, "c": 4}')],
   ...:                            schema=["Some col", "body"])

In [4]: df.show()
+--------+----------------+
|Some col|            body|
+--------+----------------+
|       1|{"a": 1, "b": 2}|
|       2|{"a": 3, "c": 4}|
+--------+----------------+

In [5]: df.select(from_json(col("body"), schema).alias("data")).select("data.*").show()
+---+----+----+----+
|  a|   b|   c|   d|
+---+----+----+----+
|  1|   2|null|null|
|  3|null|   4|null|
+---+----+----+----+

Because `from_json` is given the full schema, every field declared in the schema appears as a column in the result, and any key missing from a given JSON string simply comes out as `null` — no manual null-filling is needed.

Click here to find more related problem solutions.

Leave a Comment

Your email address will not be published.

Scroll to Top