# Author: Ju4t
from pyspark import SparkConf, SparkContext
def wordcount(input="dataset/word.txt"):
    """Count word frequencies in a whitespace-separated text file and print them.

    Relies on the module-level SparkContext ``sc`` created in the
    ``__main__`` block.

    :param input: path to the input text file (words separated by spaces)
    :return: None; one "word count" pair is printed per line
    """
    # Split each line into words, map every word to (word, 1), then sum
    # the counts per word and pull the result back to the driver.
    res = sc.textFile(input) \
        .flatMap(lambda line: line.split(" ")) \
        .map(lambda word: (word, 1)) \
        .reduceByKey(lambda x, y: x + y) \
        .collect()
    for word, count in res:
        print(word, count)
def behavior(input="dataset/UserBehavior.min.csv"):
    """Compute basic statistics over a UserBehavior-style CSV file.

    Each valid row has 5 comma-separated fields; column 0 is the user id
    and column 3 is the behavior type (e.g. 'pv'), as used below.
    Relies on the module-level SparkContext ``sc`` created in the
    ``__main__`` block.

    :param input: path to the CSV file
    :return: dict with 'total_user' (distinct user count) and
             'act_count' (list of (behavior, count) pairs); the dict is
             also printed to stdout
    """
    # Filter out malformed rows whose field count is not exactly 5.
    rdd = sc.textFile(input).map(lambda line: line.split(",")).filter(lambda x: len(x) == 5)
    # Number of distinct users (column 0 holds the user id).
    total_user = rdd.map(lambda x: x[0]).distinct().count()
    # Per-behavior-type row counts (column 3 holds the behavior type).
    bhv_count = rdd.map(lambda x: (x[3], 1)).reduceByKey(lambda x, y: x + y).collect()
    stats = {
        'total_user': total_user,
        'act_count': bhv_count,
    }
    print(stats)
    # Returning the stats makes the function usable programmatically;
    # existing callers that ignore the return value are unaffected.
    return stats
if __name__ == '__main__':
    # Run Spark in local mode; the global `sc` is consumed by
    # wordcount() / behavior().
    configuration = SparkConf().setMaster("local").setAppName("WordCount")
    sc = SparkContext(conf=configuration)
    behavior(input='dataset/UserBehavior.csv')
    sc.stop()
Hello Scala
Hello Spark
Hello BigData
column1 | column2 | column3 | column4 | column5 |
---|---|---|---|---|
1 | 2268318 | 2520377 | pv | 1511544070 |
1 | 2333346 | 2520771 | pv | 1511561733 |
1 | 2576651 | 149192 | pv | 1511572885 |
1 | 4365585 | 2520377 | pv | |
1 | 4606018 | 2735466 | pv | |
淘宝用户购物行为数据集 (Taobao user shopping-behavior dataset): https://tianchi.aliyun.com/dataset/dataDetail?dataId=649 (905.80MB compressed / 3.67G uncompressed)