# Data cleaning and feature preparation with PySpark.
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, isnull
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
# Start (or reuse) the Spark session for this job.
spark = SparkSession.builder.appName("Data Cleaning").getOrCreate()

# Load the CSV: the first row supplies column names and types are inferred.
data = spark.read.csv("data.csv", header=True, inferSchema=True)

# Keep only rows where age is above 18 and gender is "female".
filtered_data = data.filter((col("age") > 18) & (col("gender") == "female"))

# Fill missing values: height with a fixed 170, weight with the column mean.
# The mean is None when "weight" has no non-null values, and fillna cannot
# take None as a replacement, so weight is only filled when a mean exists.
fill_values = {"height": 170}
mean_weight = data.agg({"weight": "mean"}).first()[0]
if mean_weight is not None:
    fill_values["weight"] = mean_weight
filled_data = data.fillna(fill_values)

# Drop rows that are exact duplicates across all columns.
deduplicated_data = data.dropDuplicates()

# Encode the categorical "gender" column: map each label to a numeric index,
# then one-hot encode that index.
indexer = StringIndexer(inputCol="gender", outputCol="gender_indexed")
indexed_data = indexer.fit(data).transform(data)
encoder = OneHotEncoder(inputCols=["gender_indexed"], outputCols=["gender_encoded"])
encoded_data = encoder.fit(indexed_data).transform(indexed_data)

# Assemble the numeric columns into a single "features" vector column.
# VectorAssembler's default handleInvalid="error" raises on null inputs once
# the result is evaluated, so assemble from the frame whose height/weight
# nulls have already been filled rather than from the raw data.
# NOTE(review): "age" is not filled above; confirm it has no nulls.
assembler = VectorAssembler(inputCols=["age", "height", "weight"], outputCol="features")
feature_vector = assembler.transform(filled_data)