How do I separate specific characters from a column of a PySpark dataframe and form a new column using them?

You can either use a UDF to do this, or you can use the built-in regexp_extract function to pull out the characters you need. Both approaches are shown below.

import re

from pyspark import SparkContext
from pyspark.sql import SQLContext
from pyspark.sql.types import StringType
from pyspark.sql.functions import udf, regexp_extract

sc = SparkContext('local')
sqlContext = SQLContext(sc)

data1 = [
    ('A123',),
    ('321B',),
    ('C875',),
]

df1Columns = ["column1"]
df1 = sqlContext.createDataFrame(data=data1, schema=df1Columns)
df1.show(20, truncate=False)


# UDF that keeps every letter in the range A-C from the input string
def findChar(input_string):
    return "".join(re.findall("[A-C]", input_string))

findChar_udf = udf(findChar, StringType())

# UDF that keeps every run of digits from the input string
def findNumber(input_string):
    return "".join(re.findall("[0-9]+", input_string))

findNumber_udf = udf(findNumber, StringType())

print("Using udf")
df2 = df1.withColumn("column2", findChar_udf("column1")).withColumn("column3", findNumber_udf("column1"))
df2.show(20, truncate=False)

print("Using regex_extract")
df3 = df1.withColumn("column2", regexp_extract("column1", "[A-C]", 0)).withColumn("column3", regexp_extract("column1", "[0-9]+", 0))
df3.show(20, truncate=False)

Following is the output:

+-------+
|column1|
+-------+
|A123   |
|321B   |
|C875   |
+-------+

Using udf
+-------+-------+-------+
|column1|column2|column3|
+-------+-------+-------+
|A123   |A      |123    |
|321B   |B      |321    |
|C875   |C      |875    |
+-------+-------+-------+

Using regexp_extract
+-------+-------+-------+
|column1|column2|column3|
+-------+-------+-------+
|A123   |A      |123    |
|321B   |B      |321    |
|C875   |C      |875    |
+-------+-------+-------+
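
Note that regexp_extract with group index 0 returns only the first match of the pattern, while the UDF joins every match, so for a value with interleaved letters and digits (e.g. a hypothetical 'A1B2') the two approaches would give different results. As a minimal sketch, assuming the same df1 as above and the built-in regexp_replace function, you could get the "keep all matching characters" behaviour without a UDF by deleting everything that does not match:

from pyspark.sql.functions import regexp_replace

print("Using regexp_replace")
# Remove every character that is NOT in the wanted set, so all matches are kept
df4 = df1.withColumn("column2", regexp_replace("column1", "[^A-C]", "")) \
         .withColumn("column3", regexp_replace("column1", "[^0-9]", ""))
df4.show(20, truncate=False)

For this sample data the result should be identical to the regexp_extract output, since each row contains only one letter and one run of digits.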
