有时候需要先对某一列字符串建立数值索引(StringIndexer),之后再把索引还原为原始字符串(IndexToString)。
from pyspark.ml.feature import IndexToString, StringIndexer
from pyspark.sql import SparkSession
# Example script: index a string column with StringIndexer, then map the
# indices back to the original strings with IndexToString.
spark = SparkSession\
    .builder\
    .appName("IndexToStringExample")\
    .getOrCreate()
try:
    df = spark.createDataFrame(
        [(0, "a"), (1, "b"), (2, "c"), (3, "a"), (4, "a"), (5, "c")],
        ["id", "category"])

    # Build an index for the string column; indices 0, 1, 2, ... are
    # assigned by label frequency (the most frequent label gets 0).
    indexer = StringIndexer(inputCol="category", outputCol="categoryIndex")
    model = indexer.fit(df)
    indexed = model.transform(df)

    print("Transformed string column '%s' to indexed column '%s'"
          % (indexer.getInputCol(), indexer.getOutputCol()))
    indexed.show()

    print("StringIndexer will store labels in output column metadata\n")

    # Restore the indices to strings. No explicit labels are passed here, so
    # the converter relies on the labels kept in the column metadata.
    converter = IndexToString(inputCol="categoryIndex", outputCol="originalCategory")
    converted = converter.transform(indexed)

    print("Transformed indexed column '%s' back to original string column '%s' using "
          "labels in metadata" % (converter.getInputCol(), converter.getOutputCol()))
    converted.select("id", "categoryIndex", "originalCategory").show()
finally:
    # Always release the session, even if a step above raised.
    # NOTE(review): getOrCreate() may hand back an already-running session;
    # stop() shuts that one down too — confirm this script owns the session.
    spark.stop()







网友评论