add prediction-only workflow

ppalmes · ppalmes · commit 3a825aca8cd5 · 2025-09-18T08:33:06.000-05:00
diff --git a/DockerizedAutoML/main.jl b/DockerizedAutoML/main.jl
@@ -7,147 +7,155 @@ using Statistics
 
 
 function parse_commandline()
-  s = ArgParseSettings()
-  @add_arg_table! s begin
-    "--url", "-u"
-    help = "mlflow server url"
-    arg_type = String
-    default = "http://localhost:8080"
-    "--prediction_type", "-t"
-    help = "classification, regression, anomalydetection"
-    arg_type = String
-    default = "classification"
-    "--complexity", "-c"
-    help = "pipeline complexity"
-    arg_type = String
-    default = "low"
-    "--output_file", "-o"
-    help = "output location"
-    arg_type = String
-    default = "NONE"
-    "--nfolds", "-f"
-    help = "number of crossvalidation folds"
-    arg_type = Int64
-    default = 3
-    "--nworkers", "-w"
-    help = "number of workers"
-    arg_type = Int64
-    default = 5
-    "--no_save"
-    help = "save model"
-    action = :store_true
-    "--predict_only"
-    help = "no training, predict only"
-    action = :store_true
-    "--runid"
-    help = "runid of experiment for trained model"
-    arg_type = String
-    default = "NONE"
-    "csvfile"
-    help = "input csv file"
-    required = true
-  end
-  return parse_args(s; as_symbols=true)
+    s = ArgParseSettings()  # option table for the main.jl command line
+    @add_arg_table! s begin
+        "--url", "-u"
+        help = "mlflow server url"
+        arg_type = String
+        default = "http://localhost:8080"
+        "--prediction_type", "-t"  # NOTE(review): visible code only branches on classification/regression
+        help = "classification, regression, anomalydetection"
+        arg_type = String
+        default = "classification"
+        "--complexity", "-c"  # forwarded unchanged inside impl_args by the training modes
+        help = "pipeline complexity"
+        arg_type = String
+        default = "low"
+        "--output_file", "-o"  # the literal "NONE" is the sentinel for "stdout only"
+        help = "output location"
+        arg_type = String
+        default = "NONE"
+        "--nfolds", "-f"
+        help = "number of crossvalidation folds"
+        arg_type = Int64
+        default = 3
+        "--nworkers", "-w"  # also used at top level as the addprocs worker count
+        help = "number of workers"
+        arg_type = Int64
+        default = 5
+        "--no_save"  # NOTE(review): help reads "save model" but the flag name suggests the opposite; not read in the visible code
+        help = "save model"
+        action = :store_true
+        "--predict_only"  # switches main() to doprediction_only and skips worker startup
+        help = "no training, predict only"
+        action = :store_true
+        "--runid"  # read only by the predict-only path
+        help = "runid of experiment for trained model"
+        arg_type = String
+        default = "NONE"
+        "csvfile"  # the only required entry; declared without leading dashes
+        help = "input csv file"
+        required = true
+    end
+    return parse_args(s; as_symbols=true)  # Dict keyed by Symbols, e.g. args[:url]
 end
 
 const _cliargs = parse_commandline()
 const _workers = _cliargs[:nworkers]
 
 if _cliargs[:predict_only] == false
-  nprocs() == 1 && addprocs(_workers; exeflags=["--project=$(Base.active_project())"])
-  @everywhere using AutoAI
+    nprocs() == 1 && addprocs(_workers; exeflags=["--project=$(Base.active_project())"])  # add workers only if none exist yet
+    @everywhere using AutoAI  # load AutoAI on every process; skipped entirely in predict-only mode
 end
 
 function autoclassmode(args::Dict)
-  url = args[:url]
-  complexity = args[:complexity]
-  nfolds = args[:nfolds]
-  nworkers = args[:nworkers]
-  prediction_type = args[:prediction_type]
-  impl_args = (; complexity, nfolds, nworkers, prediction_type) |> pairs |> Dict
-  fname = _cliargs[:csvfile]
-  df = CSV.read(fname, DataFrame)
-  X = df[:, 1:end-1]
-  Y = df[:, end] |> collect
-  autoclass = AutoMLFlowClassification(Dict(:url => url, :impl_args => impl_args))
-  Yc = fit_transform!(autoclass, X, Y)
-  println("accuracy = ", mean(Y .== Yc))
-  return autoclass
+    url = args[:url]  # MLflow server url handed to the workflow
+    complexity = args[:complexity]
+    nfolds = args[:nfolds]
+    nworkers = args[:nworkers]
+    prediction_type = args[:prediction_type]
+    impl_args = (; complexity, nfolds, nworkers, prediction_type) |> pairs |> Dict  # Symbol-keyed Dict
+    fname = args[:csvfile]  # take the path from `args` (not the global) so the parameter is honored
+    df = CSV.read(fname, DataFrame)
+    X = df[:, 1:end-1]  # all columns except the last are inputs
+    Y = df[:, end] |> collect  # last column is the label
+    autoclass = AutoMLFlowClassification(Dict(:url => url, :impl_args => impl_args))
+    Yc = fit_transform!(autoclass, X, Y)
+    println("accuracy = ", mean(Y .== Yc))  # fraction of rows where the output on the same X equals Y
+    return autoclass  # the fitted workflow, consumed by printsummary
 end
 
 function autoregmode(args::Dict)
-  url = args[:url]
-  complexity = args[:complexity]
-  nfolds = args[:nfolds]
-  nworkers = args[:nworkers]
-  prediction_type = args[:prediction_type]
-  impl_args = (; complexity, nfolds, nworkers, prediction_type) |> pairs |> Dict
-  fname = _cliargs[:csvfile]
-  df = CSV.read(fname, DataFrame)
-  X = df[:, 1:end-1]
-  Y = df[:, end] |> collect
-  autoreg = AutoMLFlowRegression(Dict(:url => url, :impl_args => impl_args))
-  Yc = fit_transform!(autoreg, X, Y)
-  println("mse = ", mean((Y - Yc) .^ 2))
-  return autoreg
+    url = args[:url]  # MLflow server url handed to the workflow
+    complexity = args[:complexity]
+    nfolds = args[:nfolds]
+    nworkers = args[:nworkers]
+    prediction_type = args[:prediction_type]
+    impl_args = (; complexity, nfolds, nworkers, prediction_type) |> pairs |> Dict  # Symbol-keyed Dict
+    fname = args[:csvfile]  # take the path from `args` (not the global) so the parameter is honored
+    df = CSV.read(fname, DataFrame)
+    X = df[:, 1:end-1]  # all columns except the last are inputs
+    Y = df[:, end] |> collect  # last column is the regression target
+    autoreg = AutoMLFlowRegression(Dict(:url => url, :impl_args => impl_args))
+    Yc = fit_transform!(autoreg, X, Y)
+    println("mse = ", mean((Y - Yc) .^ 2))  # mean squared error of the output on the same X
+    return autoreg  # the fitted workflow, consumed by printsummary
 end
 
 function doprediction_only(args::Dict)
-  fname = args[:csvfile]
-  X = CSV.read(fname, DataFrame)
-  run_id = args[:runid]
-  url = args[:url]
-  mlf = AutoMLFlowClassification(Dict(:run_id => run_id, :url => url))
-  Yn = transform!(mlf, X)
-  ofile = args[:output_file]
-  if ofile != "NONE"
-    open(ofile, "w") do stfile
-      println(stfile, "prediction: $Yn")
-      println(stdout, "prediction: $Yn")
+    fname = args[:csvfile]
+    X = CSV.read(fname, DataFrame)  # the whole table is passed as input; no target column is split off
+    run_id = args[:runid]
+    url = args[:url]
+    # Build the workflow wrapper that matches the requested prediction type.
+    predtype = args[:prediction_type]
+    mlf = if predtype == "classification"
+        AutoMLFlowClassification(Dict(:run_id => run_id, :url => url))
+    elseif predtype == "regression"
+        AutoMLFlowRegression(Dict(:run_id => run_id, :url => url))
+    else
+        error("unknown predtype option")
+    end
+    Yn = transform!(mlf, X)
+    ofile = args[:output_file]
+    if ofile != "NONE"  # "NONE" is the CLI default and means "no output file"
+        open(ofile, "w") do stfile
+            println(stfile, "prediction: $Yn")
+            println(stdout, "prediction: $Yn")  # echo to the console as well
+        end
+    else
+        println(stdout, "prediction: $Yn")
     end
-  else
-    println(stdout, "prediction: $Yn")
-  end
-  return Yn
+    return Yn
 end
 
 function printsummary(io::IO, automl::Workflow)
-  r(x) = round(x, digits=2)
-  trainedmodel = automl.model[:automodel]
-  bestmodel = trainedmodel.model[:bestpipeline].model[:description]
-  println(io, "pipelines: $(trainedmodel.model[:dfpipelines].Description)")
-  println(io, "best_pipeline: $bestmodel")
-  bestmean = trainedmodel.model[:performance].mean[1]
-  bestsd = trainedmodel.model[:performance].sd[1]
-  println(io, "best_pipeline_performance: $(r(bestmean)) ± $(r(bestsd))")
+    fmt(v) = round(v; digits=2)  # two-decimal rounding for the report line
+    fitted = automl.model[:automodel]  # inner model stored by the workflow
+    winner = fitted.model[:bestpipeline].model[:description]
+    println(io, "pipelines: $(fitted.model[:dfpipelines].Description)")
+    println(io, "best_pipeline: $winner")
+    avg = fitted.model[:performance].mean[1]  # first row of the performance table
+    spread = fitted.model[:performance].sd[1]
+    println(io, "best_pipeline_performance: $(fmt(avg)) ± $(fmt(spread))")
 end
 
 function dotrainandpredict(args::Dict)
-  # train model
-  predtype = args[:prediction_type]
-  automl = if predtype == "classification"
-    autoclassmode(args)
-  elseif predtype == "regression"
-    autoregmode(args)
-  end
-  ofile = args[:output_file]
-  if ofile != "NONE"
-    open(ofile, "w") do stfile
-      printsummary(stfile, automl)
-      printsummary(stdout, automl)
+    predtype = args[:prediction_type]  # train a model of the requested kind
+    automl = if predtype == "classification"
+        autoclassmode(args)
+    elseif predtype == "regression"
+        autoregmode(args)
+    else  # fail early; otherwise `automl` is `nothing` and printsummary has no matching method
+        error("unknown predtype option")
+    end
+    if args[:output_file] != "NONE"  # "NONE" is the CLI default and means "no output file"
+        open(args[:output_file], "w") do stfile
+            printsummary(stfile, automl)
+            printsummary(stdout, automl)  # echo the summary to the console as well
+        end
+    else
+        printsummary(stdout, automl)
     end
-  else
-    printsummary(stdout, automl)
-  end
 end
 
 function main(args::Dict)
-  if args[:predict_only] == true
-    # predict only using run_id of model in the artifact
-    doprediction_only(args)
-  else
-    # train and predict
-    dotrainandpredict(args)
-  end
+    # Choose the workflow from the CLI flag and hand the full argument Dict to it.
+    predict_only = args[:predict_only] == true
+    return if predict_only
+        doprediction_only(args)  # predict with the model referenced by the run id
+    else
+        dotrainandpredict(args)  # train, then print the summary
+    end
 end
 main(_cliargs)
diff --git a/DockerizedAutoML/run.sh b/DockerizedAutoML/run.sh
@@ -1,9 +1,16 @@
-docker build -t automlai --platform=linux/amd64 .
-docker run -it --rm --platform=linux/amd64 automlai
+docker build -t automlai:v2.0 --platform=linux/amd64 .
+docker run -it --rm --platform=linux/amd64 automlai:v2.0
 
 # julia --project -- ./main.jl -c high -t regression -f 3 -w 7 iris_reg.csv
 # julia --project -- ./main.jl -c low -t classification -f 3 -w 3 iris.csv
 # julia --project -- ./main.jl -c low -t anomalydetection iris.csv
 # podman run -it --rm --platform=linux/amd64 localhost/automlai -u http://spendor2.sl.cloud9.ibm.com:30412 iris.csv
 # podman run -it --rm -v `pwd`:/data/  localhost/automlai -u http://spendor2.sl.cloud9.ibm.com:30412 -t regression /data/iris_reg.csv
 # julia --project -- ./main.jl -c low -t classification -f 3 -w 3 iris.csv --predict_only --runid cd4e463d6a414aa4aaad173e567d7d22 -o /tmp/hello.txt
+# Predict-only runs on the host; each --runid must already exist on the MLflow server given by -u.
+julia --project -- ./main.jl  -t regression --predict_only -u http://mlflow.isiath.duckdns.org:8082 --runid 064fb7a188d34a3da87f2271b8d8d9c2 -o /tmp/reg.txt ./iris_reg.csv
+julia --project -- ./main.jl -u http://mlflow.isiath.duckdns.org:8082 -t classification --predict_only --runid e33bbd5c12a54756b1333df1f23a8366 -o /tmp/class.txt ./iris.csv
+# Same predict-only runs in the container; use the tag built above (docker has no localhost/ prefix).
+docker run -it --rm -v "$(pwd)":/data/ automlai:v2.0 -u http://mlflow.isiath.duckdns.org:8082 -t classification --predict_only --runid e33bbd5c12a54756b1333df1f23a8366 /data/iris.csv
+
+docker run -it --rm -v "$(pwd)":/data/ automlai:v2.0 -u http://mlflow.isiath.duckdns.org:8082 -t regression --predict_only --runid 064fb7a188d34a3da87f2271b8d8d9c2 /data/iris_reg.csv