# Set up DPark: import the library and, when run as a script, create the
# context object that the rest of the job is built from.
import dpark

if __name__ == '__main__':
    # This assignment must be indented to form the body of the `if`;
    # left at column 0 the file fails with an IndentationError.
    ctx = dpark.DparkContext()
# Load the input as an RDD of text lines.
# NOTE(review): the loading step was missing here, which left `data`
# undefined. The path below is a placeholder — confirm the real input location.
data = ctx.textFile("/path/to/input.txt")

# Split every line into words, pair each word with 1, then sum the 1s per word.
# split() with no argument splits on any run of whitespace, so repeated spaces
# no longer produce empty-string "words" that would get counted.
word_count = (
    data.flatMap(lambda line: line.split())
    .map(lambda word: (word, 1))
    .reduceByKey(lambda a, b: a + b)
)
# Order the (word, count) pairs by count, highest first.
# NOTE(review): DPark's RDD appears to expose sort(key=..., reverse=...);
# sortBy(..., ascending=...) is the PySpark spelling. Confirm against the
# installed DPark version.
sorted_word_count = word_count.sort(key=lambda pair: pair[1], reverse=True)

# Keep only the words seen at least five times. This filters the unsorted
# counts, so the result carries no particular order.
filtered_word_count = word_count.filter(lambda pair: pair[1] >= 5)
# Write the sorted (word, count) pairs out as text at the given path.
sorted_word_count.saveAsTextFile("/path/to/output.txt")
conf
{
"num_cpus": 4,
"memory_limit": "10g",
"hadoop.input.format": "TextInputFormat",
"hadoop.output.format": "TextOutputFormat"
}