from pyspark.ml.linalg import Vectors
from pyspark.ml.stat import Correlation
from pyspark.sql import SparkSession
# Example: compute Pearson and Spearman correlation matrices over a column
# of ML vectors with pyspark.ml.stat.Correlation.

# Build (or reuse) the SparkSession that drives this example.
spark = (
    SparkSession.builder
    .appName("CorrelationExample")
    .getOrCreate()
)

try:
    # Sparse vector form: Vectors.sparse(size, [(index, value), ...]).
    # Every index that is not listed has the value 0.
    # Other accepted forms:
    #   Vectors.sparse(size, {idx1: val1, idx2: val2})
    #   Vectors.sparse(size, [idx1, idx2, ...], [val1, val2, ...])
    # Each row is a 1-tuple so that it maps onto the single "features" column.
    data = [
        (Vectors.sparse(4, [(0, 1.0), (3, -2.0)]),),  # i.e. [1, 0, 0, -2]
        (Vectors.dense([4.0, 5.0, 0.0, 3.0]),),
        (Vectors.dense([6.0, 7.0, 0.0, 8.0]),),
        (Vectors.sparse(4, [(0, 9.0), (3, 1.0)]),),  # i.e. [9, 0, 0, 1]
    ]
    df = spark.createDataFrame(data, ["features"])

    # Correlation.corr returns a one-row DataFrame whose first field is the
    # correlation matrix of the vector column: entry (i, j) is the
    # correlation between vector components i and j, taken across all rows.
    # NOTE: component 2 is 0 in every row above, so its correlations with
    # the other components are undefined (expected to show up as NaN).
    r1 = Correlation.corr(df, "features").head()  # Pearson is the default
    print("Pearson correlation matrix:\n" + str(r1[0]))

    r2 = Correlation.corr(df, "features", "spearman").head()  # Spearman rank
    print("Spearman correlation matrix:\n" + str(r2[0]))
finally:
    # Release the session's resources even if one of the steps above fails.
    spark.stop()