# Create the SparkSession used by the rest of the script. getOrCreate()
# returns an already-active session if one exists instead of building a new one.
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .appName("DistributedML")
    .getOrCreate()
)
# Fit a linear regression model on the training data.
from pyspark.ml.regression import LinearRegression

# No parameters are passed, so the estimator runs with the library defaults.
# NOTE(review): `trainData` is not defined in the visible source; presumably a
# DataFrame prepared earlier with the estimator's default feature/label
# columns -- confirm.
lr = LinearRegression()
lrModel = lr.fit(trainData)
# Score a predictions DataFrame with root-mean-squared error.
from pyspark.ml.evaluation import RegressionEvaluator

# "rmse" is RegressionEvaluator's default metric; it is spelled out so the
# value stored in `rmse` below is self-evident.
evaluator = RegressionEvaluator(metricName="rmse")
# NOTE(review): `predictions` is not assigned anywhere above this line in the
# visible source (it is only assigned after the model is reloaded further
# down); presumably it comes from lrModel.transform(...) on held-out data --
# confirm before running this as a script.
rmse = evaluator.evaluate(predictions)
# Tune the linear regression hyper-parameters with cross-validation.
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

# 2 regParam values x 3 elasticNetParam values = 6 candidate settings.
paramGrid = (
    ParamGridBuilder()
    .addGrid(lr.regParam, [0.1, 0.01])
    .addGrid(lr.elasticNetParam, [0.0, 0.5, 1.0])
    .build()
)

# Reuses the `lr` estimator and `evaluator` created earlier in the script;
# the number of folds is left at the library default.
crossval = CrossValidator(
    estimator=lr,
    estimatorParamMaps=paramGrid,
    evaluator=evaluator,
)
cvModel = crossval.fit(trainData)
# Persist the fitted model to storage.
# NOTE: save() fails if the target path already exists; when replacing an
# existing model is intended, use lrModel.write().overwrite().save(...).
# NOTE(review): "path/to/model" looks like a placeholder -- confirm the real
# destination.
lrModel.save("path/to/model")
# Reload the persisted model and use it to score the test data.
from pyspark.ml.regression import LinearRegressionModel

# Rebinds `lrModel` to the instance read back from the saved path.
lrModel = LinearRegressionModel.load("path/to/model")
# NOTE(review): `testData` is not defined in the visible source; presumably a
# DataFrame with the same feature column the model was trained on -- confirm.
predictions = lrModel.transform(testData)