# Install the PySpark package for the active Python environment.
pip install pyspark

# Start a standalone Spark master.
# Assumes the current directory is the root of a Spark installation.
./sbin/start-master.sh

# Attach a worker to the master. The URL is read from SPARK_MASTER_URL because
# a literal "<master-url>" placeholder is parsed by the shell as redirections
# and makes the script fail to parse. The :? guard aborts with a message when
# the variable is unset or empty.
./sbin/start-worker.sh "${SPARK_MASTER_URL:?set SPARK_MASTER_URL to the master URL, e.g. spark://host:7077}"
# Run the PySpark walkthrough in ONE Python interpreter. The heredoc delimiter
# is quoted so the shell passes the Python source through without expanding
# anything; a single process keeps `spark` and `data` alive across the steps.
python - <<'PY'
from pyspark.sql import SparkSession

# Create the session, or reuse one that already exists.
spark = SparkSession.builder.appName("SparkTutorial").getOrCreate()

# Load the CSV, treating the first row as column names and inferring types.
data = spark.read.csv("data.csv", header=True, inferSchema=True)

# Number of rows to preview (was an undefined name in the original snippet).
n = 5
data.show(n)

# Each call below returns a new DataFrame; nothing is printed or persisted
# unless an action such as .show() or a write is applied to the result.
data.select("column1", "column2")
data.filter(data["column1"] > 100)
data.groupBy("column1").agg({"column2": "mean"})
data.orderBy("column1")

# Write the DataFrame out as CSV with a header row.
# NOTE(review): Spark writes a directory of part files at this path and, with
# the default save mode, refuses to overwrite an existing one -- confirm this
# is the intended output location.
data.write.csv("output.csv", header=True)
PY
# Submit a Python application to the Spark cluster.
# SPARK_MASTER_URL - master URL to submit to (e.g. spark://host:7077)
# SPARK_APP_FILE   - path of the Python file to run
# Both are required: literal "<...>" placeholders are parsed by the shell as
# redirections, so the values are taken from quoted variables instead, and the
# :? guards abort with a message when either is unset or empty.
spark-submit \
  --master "${SPARK_MASTER_URL:?set SPARK_MASTER_URL to the master URL, e.g. spark://host:7077}" \
  "${SPARK_APP_FILE:?set SPARK_APP_FILE to the Python application to submit}"