产品概述
产品优势
应用场景



# Notebook example: train a scikit-learn K-nearest-neighbours classifier on the
# iris data, log it with MLflow, and run batch inference through a Spark UDF.
#
# The packages below must be installed before the imports can succeed. In a
# notebook, run these shell escapes in their own cell first (they are not valid
# Python, so they are kept here as comments):
#
#   Install the driver:
#     !pip install tencentcloud-dlc-connector
#     !pip install --upgrade 'sqlalchemy<2.0'
#   Install pinned versions:
#     !pip install --upgrade pandas==2.2.3
#     !pip install numpy
#     !pip install matplotlib

from pyspark.sql import SparkSession
from sklearn import datasets
from sklearn.neighbors import KNeighborsClassifier
import mlflow
from mlflow.models import infer_signature
import pandas as pd
import numpy as np
import tdlc_connector
from tdlc_connector import constants

spark = SparkSession.builder.getOrCreate()

# Enable automatic MLflow logging for scikit-learn estimators.
mlflow.sklearn.autolog()

# ---------------------------------------------------------------------------
# Load the dataset
# ---------------------------------------------------------------------------
# Method 1: load the dataset through the machine-learning library.
# NOTE: Method 2 below overwrites X and y; keep only the method you need.
X, y = datasets.load_iris(as_frame=True, return_X_y=True)

# Method 2: load data stored in DLC through tencentcloud-dlc-connector,
# accessing it as a table.
# NOTE(review): the credentials below are placeholders; prefer reading real
# values from environment variables or a secret store over hard-coding them.
conn = tdlc_connector.connect(
    region="ap-***",  # fill in the correct region, e.g. ap-Singapore, ap-Shanghai
    secret_id="*******",
    secret_key="*******",
    engine="your engine",  # fill in the name of the purchased engine
    resource_group=None,
    engine_type=constants.EngineType.AUTO,
    result_style=constants.ResultStyles.LIST,
    download=True,
)
query = """SELECT `sepal.length`, `sepal.width`,`petal.length`,`petal.width`,species FROM at_database_testnotebook.demo_test_sklearn"""
iris = pd.read_sql(query, conn)

# The query selects dotted column names (e.g. `sepal.length`), while the
# feature list below uses underscores. Normalise the names so the selection
# matches; dots would also need back-tick quoting in Spark column references.
# Presumably the connector returns the columns named as in the SELECT --
# verify against the actual result set.
iris = iris.rename(columns=lambda name: str(name).replace(".", "_"))
spark_iris = spark.createDataFrame(iris)

# ---------------------------------------------------------------------------
# Split feature columns and target column
# ---------------------------------------------------------------------------
feature_cols = ["sepal_length", "sepal_width", "petal_length", "petal_width"]
# scikit-learn estimators need in-memory array-like data (pandas / NumPy),
# not Spark DataFrames, so train from the pandas frame.
X = iris[feature_cols]
y = iris["species"]

# ---------------------------------------------------------------------------
# Classify with the K-nearest-neighbours algorithm
# ---------------------------------------------------------------------------
model = KNeighborsClassifier()
model.fit(X, y)
predictions = model.predict(X)
signature = infer_signature(X, predictions)

# Log the fitted model explicitly so its URI can be used for inference below.
with mlflow.start_run():
    model_info = mlflow.sklearn.log_model(
        model, artifact_path="model", signature=signature
    )

# ---------------------------------------------------------------------------
# Batch inference through a Spark UDF
# ---------------------------------------------------------------------------
# Reuse the Spark DataFrame built from the DLC data, restricted to the
# feature columns the model was trained on.
infer_spark_df = spark_iris.select(*feature_cols)
pyfunc_udf = mlflow.pyfunc.spark_udf(spark, model_info.model_uri)
result = infer_spark_df.select(
    pyfunc_udf(*X.columns).alias("predictions")
).toPandas()
print(result)


文档反馈