spark 评论标签生成

作者: 良人与我 | 来源:发表于2019-05-07 21:37 被阅读1次

在我们使用的app中,大部分都有评论功能(特别是电商和社交类的软件)。
有了评论,肯定要有分析评论的功能。
例如 一家饭店,被很多食客评论。最终商家想看到 大多数人的感受是什么?
例如 (好吃 100) - 评价好吃的有100 次
(环境不错 200)(经济实惠 99)
之类的。

现在我们就通过spark 来实现此功能
数据源是 已经经过提取后的文档,格式如下
每行为 商家 id 和 评论数据(JSON),两者以制表符(\t)分隔

86913510    {"reviewPics":null,"extInfoList":null,"expenseList":null,"reviewIndexes":[2],"scoreList":[{"score":5,"title":"口味","desc":""},{"score":5,"title":"服务","desc":""},{"score":5,"title":"环境","desc":""}]}
86913510    {"reviewPics":null,"extInfoList":null,"expenseList":null,"reviewIndexes":[2],"scoreList":[{"score":5,"title":"环境","desc":""},{"score":5,"title":"服务","desc":""},{"score":5,"title":"口味","desc":""}]}
86913510    {"reviewPics":null,"extInfoList":null,"expenseList":null,"reviewIndexes":[1,2],"scoreList":[{"score":5,"title":"环境","desc":""},{"score":5,"title":"服务","desc":""},{"score":5,"title":"口味","desc":""}]}
86913510    {"reviewPics":null,"extInfoList":null,"expenseList":null,"reviewIndexes":[2],"scoreList":null}

程序代码

package com.river.tag;

import com.google.gson.Gson;
import com.river.tag.dto.ReviewVo;
import org.apache.commons.collections.CollectionUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import scala.Tuple2;
import scala.Tuple3;

import java.util.ArrayList;
import java.util.List;
import java.util.stream.Collectors;

/**
 * Counts how often each review tag occurs per shop and prints, for every shop,
 * its tags ordered from most to least frequent.
 *
 * <p>Each input line is expected to be {@code <shopId>\t<review JSON>}; lines that
 * do not split into exactly two tab-separated fields are ignored. Only the values
 * of {@code extInfoList} entries whose title is {@code "contentTags"} are counted.
 *
 * <p>Output: one line per shop in the form
 * {@code shopId [(count,tag,shopId), (count,tag,shopId), ...]}.
 */
public class TagSparkDemo {

    /** Input file used when no path is passed on the command line. */
    private static final String DEFAULT_INPUT_PATH = "/Users/riverfan/mytest/spark/tag/temptags.txt";

    /** Shared JSON parser; referenced statically so it is created once per JVM, not once per record. */
    private static final Gson GSON = new Gson();

    /**
     * Runs the tag-counting job on a local two-thread Spark master.
     *
     * @param args optional; {@code args[0]} overrides the default input file path
     */
    public static void main(String[] args) {
        String filePath = (args != null && args.length > 0) ? args[0] : DEFAULT_INPUT_PATH;

        SparkConf conf = new SparkConf()
                .setAppName("MapReduceActionDemon")
                .setMaster("local[2]");

        // try-with-resources stops the context even when the job throws.
        try (JavaSparkContext sc = new JavaSparkContext(conf)) {
            sc.textFile(filePath)
                    .map(line -> line.split("\t"))
                    .filter(fields -> fields.length == 2)
                    .mapToPair(fields -> new Tuple2<>(fields[0], GSON.fromJson(fields[1], ReviewVo.class)))
                    // fromJson yields null for an empty document; skip those along with tag-less reviews.
                    .filter(t -> t._2() != null && !CollectionUtils.isEmpty(t._2().getExtInfoList()))
                    // Flatten each review into (shopId, tag) pairs.
                    .map(t -> t._2().getExtInfoList().stream()
                            .filter(i -> "contentTags".equals(i.getTitle()) && i.getValues() != null)
                            .map(i -> i.getValues())
                            .flatMap(i -> i.stream())
                            .map(i -> new Tuple2<>(t._1(), i))
                            .collect(Collectors.toList()))
                    .flatMap(t -> t.stream().iterator())
                    // Key on the (shopId, tag) tuple itself rather than a "-"-joined string,
                    // so tags containing "-" are not truncated when the key is taken apart.
                    .mapToPair(t -> new Tuple2<>(t, 1))
                    .reduceByKey((left, right) -> left + right)
                    .map(t -> new Tuple3<>(t._2(), t._1()._2(), t._1()._1()))
                    .groupBy(Tuple3::_3)
                    // Sort after grouping: the shuffle behind groupBy does not guarantee
                    // that an ordering established beforehand survives inside each group.
                    .mapValues(group -> sortByCountDescending(group))
                    .foreach(t -> System.out.println(t._1() + " " + t._2()));
        }
    }

    /**
     * Copies one shop's (count, tag, shopId) tuples into a list ordered by count, highest first.
     * The sort is stable, so tuples with equal counts keep their incoming relative order.
     *
     * @param group the tuples belonging to a single shop
     * @param <T>   type of the tag element
     * @return a new list holding the same tuples in descending count order
     */
    private static <T> List<Tuple3<Integer, T, String>> sortByCountDescending(
            Iterable<Tuple3<Integer, T, String>> group) {
        List<Tuple3<Integer, T, String>> sorted = new ArrayList<>();
        group.forEach(sorted::add);
        sorted.sort((a, b) -> Integer.compare(b._1(), a._1()));
        return sorted;
    }

}

算法如下:
1.将json 数据映射为类
2.提取出来 商家id 和 评价内容(数组)
3.转化为 商家id 和 评价内容(压扁数据)
4.以 商家id-评价内容 为key 计算同一评价的个数,并按照个数的 多少 排序
5.按 商家id 分组,转化为 商家id [(评论数,评论内容,商家id), ...]

程序执行结果

73879078 [(3,饮品赞,73879078), (2,味道赞,73879078), (2,服务热情,73879078), (2,回头客,73879078), (2,价格实惠,73879078), (1,性价比高,73879078), (1,情侣约会,73879078), (1,分量足,73879078)]
88284865 [(1,无推销,88284865), (1,服务热情,88284865), (1,停车不便,88284865), (1,有异味,88284865), (1,性价比低,88284865), (1,价格高,88284865), (1,价格实惠,88284865), (1,停车收费,88284865)]
83644298 [(1,价格实惠,83644298), (1,性价比高,83644298), (1,体验好,83644298), (1,味道赞,83644298), (1,服务热情,83644298)]
82317795 [(1,味道差,82317795)]
77705462 [(3,服务热情,77705462), (2,羊肉,77705462), (2,环境优雅,77705462), (2,价格实惠,77705462), (1,肉类好,77705462), (1,干净卫生,77705462), (1,味道赞,77705462), (1,羊蝎子,77705462), (1,回头客,77705462)]
83073343 [(17,干净卫生,83073343), (16,味道赞,83073343), (15,环境优雅,83073343), (11,服务热情,83073343), (11,菜品不错,83073343), (9,肉类好,83073343), (8,性价比高,83073343), (7,体验好,83073343), (7,分量足,83073343), (6,回头客,83073343), (4,价格实惠,83073343), (1,味道一般,83073343), (1,分量少,83073343), (1,上菜慢,83073343), (1,服务差,83073343)]
76114040 [(1,性价比高,76114040)]
85766086 [(2,价格实惠,85766086), (2,味道赞,85766086), (2,服务热情,85766086), (2,干净卫生,85766086), (1,菜品不错,85766086), (1,环境优雅,85766086), (1,上菜慢,85766086), (1,体验好,85766086), (1,服务差,85766086), (1,性价比高,85766086)]
86913510 [(1,分量适中,86913510), (1,午餐,86913510)]
74145782 [(18,服务热情,74145782), (14,味道赞,74145782), (13,上菜快,74145782), (13,干净卫生,74145782), (12,菜品不错,74145782), (11,环境优雅,74145782), (11,分量足,74145782), (11,回头客,74145782), (6,性价比高,74145782), (5,停车方便,74145782), (4,体验好,74145782), (3,不推荐,74145782), (2,菜品差,74145782), (2,服务差,74145782), (1,价格实惠,74145782), (1,体验差,74145782), (1,分量少,74145782)]
78477325 [(8,味道赞,78477325), (7,回头客,78477325), (5,干净卫生,78477325), (4,味道一般,78477325), (4,性价比高,78477325), (4,价格实惠,78477325), (3,菜品不错,78477325), (2,价格适中,78477325), (2,服务热情,78477325), (2,环境优雅,78477325), (1,性价比一般,78477325), (1,味道差,78477325), (1,不推荐,78477325), (1,肉类好,78477325), (1,上菜快,78477325)]
71039150 [(1,环境优雅,71039150), (1,价格实惠,71039150), (1,体验好,71039150), (1,朋友聚会,71039150), (1,团建,71039150)]
88496862 [(5,回头客,88496862), (4,味道赞,88496862), (4,服务热情,88496862), (3,分量足,88496862), (2,性价比高,88496862), (1,高大上,88496862), (1,价格实惠,88496862)]
83981222 [(4,性价比高,83981222), (3,干净卫生,83981222), (3,菜品不错,83981222), (3,价格实惠,83981222), (2,服务热情,83981222), (2,味道赞,83981222), (2,分量足,83981222), (2,环境优雅,83981222)]
70611801 [(4,干净卫生,70611801), (3,回头客,70611801), (2,环境优雅,70611801), (2,味道赞,70611801), (2,分量足,70611801), (1,价格实惠,70611801), (1,肉类好,70611801), (1,服务热情,70611801), (1,性价比高,70611801)]
73963176 [(15,味道赞,73963176), (12,价格实惠,73963176), (11,分量足,73963176), (10,菜品不错,73963176), (7,肉类好,73963176), (7,干净卫生,73963176), (7,服务热情,73963176), (6,环境优雅,73963176), (4,回头客,73963176), (3,性价比高,73963176), (1,价格高,73963176), (1,服务差,73963176), (1,味道一般,73963176)]
82705919 [(3,回头客,82705919), (3,干净卫生,82705919), (2,饮品赞,82705919), (2,点心好,82705919), (1,味道赞,82705919), (1,性价比高,82705919)]
84270191 [(2,价格实惠,84270191), (2,性价比高,84270191), (2,服务热情,84270191), (2,体验好,84270191), (1,环境优雅,84270191), (1,干净卫生,84270191), (1,大夫赞,84270191), (1,体验舒服,84270191)]
89223651 [(8,环境优雅,89223651), (8,服务热情,89223651), (7,技师专业,89223651), (5,干净卫生,89223651), (4,价格实惠,89223651), (4,服务一般,89223651), (4,无推销,89223651), (3,无办卡,89223651), (2,环境一般,89223651), (2,体验好,89223651)]
87994574 [(12,无推销,87994574), (8,价格实惠,87994574), (7,服务热情,87994574), (7,干净卫生,87994574), (5,效果赞,87994574), (5,无办卡,87994574), (4,环境优雅,87994574), (3,技师专业,87994574), (2,体验好,87994574), (2,没有异味,87994574), (1,效果差,87994574), (1,韩系风格,87994574)]
82016443 [(3,分量足,82016443), (2,环境优雅,82016443), (2,主食赞,82016443), (2,干净卫生,82016443), (2,味道赞,82016443), (1,价格实惠,82016443), (1,肉类好,82016443), (1,服务热情,82016443)]
77373671 [(1,干净卫生,77373671), (1,服务热情,77373671), (1,分量少,77373671), (1,菜品差,77373671), (1,价格实惠,77373671)]
77287793 [(29,干净卫生,77287793), (26,音响效果好,77287793), (26,环境优雅,77287793), (25,价格实惠,77287793), (25,交通便利,77287793), (19,性价比高,77287793), (18,服务热情,77287793), (16,高大上,77287793), (13,停车方便,77287793), (13,体验好,77287793), (1,音响效果差,77287793), (1,服务一般,77287793), (1,服务差,77287793), (1,朋友聚会,77287793)]
75144086 [(38,服务热情,75144086), (30,效果赞,75144086), (22,环境优雅,75144086), (22,无办卡,75144086), (21,性价比高,75144086), (21,技师专业,75144086), (19,无推销,75144086), (18,价格实惠,75144086), (13,干净卫生,75144086), (12,体验好,75144086), (10,韩系风格,75144086), (3,美发师手艺好,75144086), (2,服务差,75144086), (1,效果差,75144086), (1,美发效果好,75144086)]
79197522 [(2,服务热情,79197522), (1,价格实惠,79197522), (1,技师专业,79197522), (1,放松舒服,79197522), (1,体验舒服,79197522), (1,干净卫生,79197522)]
85648235 [(17,味道赞,85648235), (15,服务热情,85648235), (13,干净卫生,85648235), (12,上菜快,85648235), (11,回头客,85648235), (10,性价比高,85648235), (9,体验好,85648235), (8,环境优雅,85648235), (8,价格实惠,85648235), (7,分量足,85648235), (1,情侣约会,85648235)]
83084036 [(1,价格实惠,83084036), (1,干净卫生,83084036)]
73607905 [(16,菜品不错,73607905), (15,干净卫生,73607905), (15,回头客,73607905), (14,服务热情,73607905), (14,味道赞,73607905), (13,分量足,73607905), (11,性价比高,73607905), (11,肉类好,73607905), (7,环境优雅,73607905), (5,体验好,73607905), (5,上菜快,73607905), (2,体验差,73607905), (1,羊蝎子,73607905), (1,价格实惠,73607905)]
78824187 [(13,价格实惠,78824187), (11,回头客,78824187), (10,分量足,78824187), (8,性价比高,78824187), (8,环境优雅,78824187), (7,干净卫生,78824187), (6,上菜快,78824187), (5,主食赞,78824187), (5,服务热情,78824187), (4,味道赞,78824187), (1,服务差,78824187)]
76893145 [(10,服务热情,76893145), (7,环境优雅,76893145), (5,味道赞,76893145), (5,咖啡厅,76893145), (5,高大上,76893145), (4,回头客,76893145), (3,温馨浪漫,76893145), (2,服务一般,76893145), (2,体验好,76893145), (2,味道一般,76893145), (1,性价比高,76893145), (1,饮品赞,76893145)]

程序 github 地址
https://github.com/riverfrank/big-data-demon

相关文章

网友评论

    本文标题:spark 评论标签生成

    本文链接:https://www.haomeiwen.com/subject/ioyeoqtx.html