Spark wordcount (Python version)

Author: Ju4t

main.py

from pyspark import SparkConf, SparkContext


def wordcount(input="dataset/word.txt"):
    """
    Count occurrences of each whitespace-separated word in a text file
    and print them, one ``word count`` pair per line.

    Relies on the module-level SparkContext ``sc`` created in ``__main__``.

    :param input: path to the input text file (words separated by spaces)
    :return: None; results are printed to stdout
    """
    # Pipeline: lines -> words -> (word, 1) -> summed counts.
    res = sc.textFile(input) \
        .flatMap(lambda line: line.split(" ")) \
        .map(lambda word: (word, 1)) \
        .reduceByKey(lambda x, y: x + y) \
        .collect()
    for word, count in res:
        print(word, count)


def behavior(input="dataset/UserBehavior.min.csv"):
    """
    Summarize a user-behavior CSV: distinct user count plus a tally of
    each behavior type, printed as a dict.

    Relies on the module-level SparkContext ``sc`` created in ``__main__``.

    :param input: path to a CSV file; rows are expected to have exactly
                  5 comma-separated fields (user id in field 0, behavior
                  type in field 3 — assumed from usage; verify with data)
    :return: None; the summary dict is printed to stdout
    """
    # Drop malformed rows: keep only those with exactly 5 fields.
    rows = sc.textFile(input) \
        .map(lambda line: line.split(",")) \
        .filter(lambda fields: len(fields) == 5)

    # Number of distinct users (field 0).
    total_user = rows.map(lambda fields: fields[0]).distinct().count()
    # How many times each behavior type (field 3) occurs.
    bhv_count = rows.map(lambda fields: (fields[3], 1)) \
        .reduceByKey(lambda a, b: a + b) \
        .collect()

    print({
        'total_user': total_user,
        'act_count': bhv_count
    })


if __name__ == '__main__':
    # Run Spark locally; ``sc`` is assigned at module level here so the
    # job functions above can reference it as a global.
    conf = SparkConf().setMaster("local").setAppName("WordCount")
    sc = SparkContext(conf=conf)
    behavior(input='dataset/UserBehavior.csv')
    sc.stop()

数据示例

word.txt

Hello Scala
Hello Spark
Hello BigData

UserBehavior.csv

column1 column2 column3 column4 column5
1 2268318 2520377 pv 1511544070
1 2333346 2520771 pv 1511561733
1 2576651 149192 pv 1511572885
1 4365585 2520377 pv
1 4606018 2735466 pv

淘宝用户购物行为数据集 https://tianchi.aliyun.com/dataset/dataDetail?dataId=649 (905.80MB / 3.67G)