# Data-cleaning walkthrough with PySpark: load a CSV, then demonstrate
# filtering, null filling, de-duplication, categorical encoding and feature
# assembly.
#
# NOTE(review): every step below starts from the raw `data` frame, so the
# resulting frames are independent of one another. Chain them (e.g. fill,
# then de-duplicate, then encode) if a single cleaned frame is intended.
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, isnull
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler

# Reuse an active session if one exists, otherwise start a new one.
spark = SparkSession.builder.appName("Data Cleaning").getOrCreate()

# The first row holds the column names; column types are inferred by Spark.
data = spark.read.csv("data.csv", header=True, inferSchema=True)

# Keep only rows for females older than 18.
is_adult = col("age") > 18
is_female = col("gender") == "female"
filtered_data = data.filter(is_adult & is_female)

# Replace missing heights with a fixed default and missing weights with the
# column mean.
mean_weight = data.select("weight").agg({"weight": "mean"}).first()[0]
fill_values = {"height": 170}
if mean_weight is not None:
    # An empty or all-null column has no mean; fillna() does not accept None
    # as a replacement value, so the weight entry is only added when a mean
    # actually exists.
    fill_values["weight"] = mean_weight
filled_data = data.fillna(fill_values)

# Drop rows that are exact duplicates across all columns.
deduplicated_data = data.dropDuplicates()

# Encode the categorical `gender` column: string -> numeric index -> one-hot
# vector.
indexer = StringIndexer(inputCol="gender", outputCol="gender_indexed")
indexed_data = indexer.fit(data).transform(data)
encoder = OneHotEncoder(
    inputCols=["gender_indexed"],
    outputCols=["gender_encoded"],
)
encoded_data = encoder.fit(indexed_data).transform(indexed_data)

# Combine the numeric columns into a single `features` vector column.
# NOTE(review): VectorAssembler defaults to handleInvalid="error", so null
# values in age/height/weight of the raw frame will raise when the result is
# evaluated; consider assembling from `filled_data` instead -- confirm intent.
assembler = VectorAssembler(
    inputCols=["age", "height", "weight"],
    outputCol="features",
)
feature_vector = assembler.transform(data)


上一篇:
下一篇:
切换中文