You can either use a UDF to do this, or you can use the built-in regexp_extract function to extract the letter and the digits directly.
import re

from pyspark import SparkContext, SQLContext
from pyspark.sql.functions import udf, regexp_extract
from pyspark.sql.types import StringType

sc = SparkContext('local')
sqlContext = SQLContext(sc)
data1 = [
    ('A123',),
    ('321B',),
    ('C875',),
]
df1Columns = ["column1"]
df1 = sqlContext.createDataFrame(data=data1, schema=df1Columns)
df1.show(20, truncate=False)
# UDF that collects every letter in A-C from the string
def findChar(input_string):
    result = "".join(re.findall("[A-C]", input_string))
    return result

findChar_udf = udf(findChar, StringType())

# UDF that collects every run of digits from the string
def findNumber(input_string):
    result = "".join(re.findall("[0-9]+", input_string))
    return result

findNumber_udf = udf(findNumber, StringType())
print("Using udf")
df2 = df1.withColumn("column2", findChar_udf("column1")).withColumn("column3", findNumber_udf("column1"))
df2.show(20, truncate=False)
print("Using regex_extract")
df3 = df1.withColumn("column2", regexp_extract("column1", "[A-C]", 0)).withColumn("column3", regexp_extract("column1", "[0-9]+", 0))
df3.show(20, truncate=False)
Following is the output:
+-------+
|column1|
+-------+
|A123 |
|321B |
|C875 |
+-------+
Using udf
+-------+-------+-------+
|column1|column2|column3|
+-------+-------+-------+
|A123 |A |123 |
|321B |B |321 |
|C875 |C |875 |
+-------+-------+-------+
Using regexp_extract
+-------+-------+-------+
|column1|column2|column3|
+-------+-------+-------+
|A123 |A |123 |
|321B |B |321 |
|C875 |C |875 |
+-------+-------+-------+
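Note that the two approaches are not always equivalent: the UDFs join every match that re.findall returns, while regexp_extract with group index 0 returns only the first match of the pattern. They agree here because each input holds a single letter and a single run of digits. A hypothetical value such as 'A1B2' (not in the data above) shows the difference:

# 'A1B2' is a hypothetical multi-match input, not part of the original data
data2 = [('A1B2',)]
df4 = sqlContext.createDataFrame(data=data2, schema=df1Columns)
df4.withColumn("udf_letters", findChar_udf("column1")) \
   .withColumn("regex_letters", regexp_extract("column1", "[A-C]", 0)) \
   .show(truncate=False)
# udf_letters is 'AB' (all matches joined), regex_letters is 'A' (first match only)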
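Also, if you are on Spark 2.x or later, SQLContext is superseded by SparkSession. A minimal sketch of the equivalent setup, assuming the same data1 and df1Columns as above:

from pyspark.sql import SparkSession

# SparkSession replaces the SparkContext/SQLContext pair on Spark 2.x+
spark = SparkSession.builder.master('local').getOrCreate()
df1 = spark.createDataFrame(data=data1, schema=df1Columns)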