diff --git a/frameworks/MLflow/3.10.1/Dockerfile b/frameworks/MLflow/3.10.1/Dockerfile
new file mode 100644
index 0000000000000000000000000000000000000000..d997d51dde40575910d00994b613b66321c4b5cd
--- /dev/null
+++ b/frameworks/MLflow/3.10.1/Dockerfile
@@ -0,0 +1,35 @@
+# MLflow tracking server on OpenCloudOS 9.
+# The install step uses a BuildKit cache mount, so build with BuildKit.
+
+# NOTE(review): ":latest" floats between builds; pin a release tag or digest
+# for reproducible images once one is chosen.
+FROM opencloudos/opencloudos9-minimal:latest
+
+# Declared ahead of the labels so image metadata follows the installed version.
+ARG MLFLOW_VERSION=3.10.1
+
+LABEL maintainer="stronking 363133710@qq.com" \
+      org.opencontainers.image.source="https://gitee.com/OpenCloudOS/ai-agent-container" \
+      org.opencontainers.image.description="MLflow ${MLFLOW_VERSION} container image based on OpenCloudOS 9"
+
+ENV LANG=en_US.UTF-8 \
+    LC_ALL=en_US.UTF-8 \
+    PYTHONUNBUFFERED=1
+
+# Install MLflow. Bootstrapping pip and installing in one step keeps them in a
+# single layer; "python3 -m pip" targets the interpreter that was bootstrapped.
+RUN --mount=type=cache,id=pip-cache-opencloudos9-cu128,target=/root/.cache/pip \
+    python3 -m ensurepip \
+    && python3 -m pip install "mlflow==${MLFLOW_VERSION}"
+
+# Default MLflow configuration (override with `docker run -e ...`)
+ENV MLFLOW_HOST=0.0.0.0 \
+    MLFLOW_PORT=5000
+
+EXPOSE ${MLFLOW_PORT}
+
+WORKDIR /workspace
+
+# The shell is only needed to expand the variables; `exec` then replaces it so
+# the server runs as PID 1 and receives the stop signal from `docker stop`.
+CMD ["sh", "-c", "exec mlflow server --host \"${MLFLOW_HOST}\" --port \"${MLFLOW_PORT}\""]
diff --git a/frameworks/MLflow/3.10.1/README.md b/frameworks/MLflow/3.10.1/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..f0711c8bc61b35de50c8d618c0bc72ebc84c8dfb
--- /dev/null
+++ b/frameworks/MLflow/3.10.1/README.md
@@ -0,0 +1,189 @@
+
+# MLflow on OpenCloudOS 9
+
+## 基本信息
+
+- **框架版本**:v3.10.1
+- **基础镜像**:opencloudos9-minimal
+- **Python 版本**:3.11
+- **CUDA 版本**:N/A
+
+---
+
+## 项目简介
+
+[MLflow](https://github.com/mlflow/mlflow) 是一个开源的机器学习生命周期管理平台,主要提供以下能力:
+
+- 实验追踪(Experiment Tracking)
+- 参数与指标记录
+- Artifact 管理
+- 模型注册(Model Registry)
+- 模型部署(Serving)
+
+本镜像基于 OpenCloudOS 9 构建,提供轻量级 MLflow Tracking Server 运行环境。
+
+---
+
+## 构建
+
+```bash
+docker build -t oc9-mlflow:3.10.1 .
+```
+
+---
+
+## 使用示例
+
+### 查看 MLflow 版本
+
+```bash
+docker run --rm oc9-mlflow:3.10.1 \
+  python3 -c "import mlflow; print(mlflow.__version__)"
+```
+
+---
+
+## 启动 MLflow Tracking Server
+
+```bash
+docker run -d \
+  --name mlflow \
+  -p 5000:5000 \
+  oc9-mlflow:3.10.1
+```
+
+访问:
+
+```text
+http://localhost:5000
+```
+
+---
+
+## 持久化实验数据
+
+默认情况下,容器中的实验数据会随着容器删除而丢失。
+
+推荐挂载数据目录(SQLite 使用绝对路径时需写成 `sqlite:////`,即四个斜杠;三个斜杠表示相对于工作目录的路径):
+
+```bash
+docker run -d \
+  --name mlflow \
+  -p 5000:5000 \
+  -v $(pwd)/mlruns:/mlruns \
+  oc9-mlflow:3.10.1 \
+  sh -c "mlflow server \
+    --backend-store-uri sqlite:////mlruns/mlflow.db \
+    --artifacts-destination /mlruns/artifacts \
+    --host 0.0.0.0 \
+    --port 5000"
+```
+
+目录说明:
+
+```text
+mlruns/
+├── mlflow.db
+└── artifacts
+```
+
+---
+
+## 实验追踪示例
+
+创建示例脚本 `train.py`:
+
+```python
+import mlflow
+import random
+
+mlflow.set_tracking_uri("http://127.0.0.1:5000")
+mlflow.set_experiment("demo-experiment")
+
+with mlflow.start_run():
+
+    mlflow.log_param("learning_rate", 0.01)
+    mlflow.log_param("epochs", 10)
+
+    for step in range(10):
+        loss = 1.0 / (step + 1)
+        accuracy = 0.8 + random.random() * 0.1
+
+        mlflow.log_metric("loss", loss, step=step)
+        mlflow.log_metric("accuracy", accuracy, step=step)
+
+    with open("result.txt", "w") as f:
+        f.write("training completed")
+
+    mlflow.log_artifact("result.txt")
+```
+
+运行:
+
+```bash
+python3 train.py
+```
+
+然后访问:
+
+```text
+http://localhost:5000
+```
+
+即可查看实验参数、指标和 Artifact。
+
+---
+
+## 容器网络使用示例
+
+如果训练任务运行在其他容器中,建议使用 Docker Network。
+
+创建网络:
+
+```bash
+docker network create mlflow-net
+```
+
+启动 MLflow:
+
+```bash
+docker run -d \
+  --name mlflow \
+  --network mlflow-net \
+  -p 5000:5000 \
+  oc9-mlflow:3.10.1
+```
+
+训练容器中配置:
+
+```bash
+export MLFLOW_TRACKING_URI=http://mlflow:5000
+```
+
+---
+
+## 默认配置
+
+| 配置项            | 默认值     |
+| ----------------- | ---------- |
+| Host              | 0.0.0.0    |
+| Port              | 5000       |
+| Working Directory | /workspace |
+
+---
+
+## 已知问题
+
+* 当前镜像为轻量版,不包含 PyTorch、TensorFlow 等深度学习框架
+* 不包含 CUDA 与 GPU 运行环境
+* Model Serving 场景下,部分依赖需用户自行安装
+
+---
+
+## 上游项目
+
+* MLflow: https://github.com/mlflow/mlflow
+* OpenCloudOS: https://gitee.com/OpenCloudOS
+
+```
+```
diff --git a/frameworks/MLflow/3.10.1/build.conf b/frameworks/MLflow/3.10.1/build.conf
new file mode 100644
index 0000000000000000000000000000000000000000..baaea75b532f11438fad7a26353f8747c7547a94
--- /dev/null
+++ b/frameworks/MLflow/3.10.1/build.conf
@@ -0,0 +1,4 @@
+# MLflow 3.10.1 on OpenCloudOS 9
+IMAGE_NAME=oc9-mlflow
+IMAGE_TAG=3.10.1
+GPU_TEST=false
diff --git a/frameworks/MLflow/3.10.1/test.sh b/frameworks/MLflow/3.10.1/test.sh
new file mode 100644
index 0000000000000000000000000000000000000000..1cb9a5c4f2d347be491716445d15ce75375f157d
--- /dev/null
+++ b/frameworks/MLflow/3.10.1/test.sh
@@ -0,0 +1,116 @@
+#!/bin/bash
+# Smoke tests for an MLflow image: module import, CLI, experiment tracking,
+# and tracking-server startup.
+# Usage: bash test.sh <image>
+set -euo pipefail
+
+IMAGE="${1:?ERROR: 缺少镜像参数。用法: bash test.sh <image>}"
+
+# An array keeps every docker argument a separate word without relying on
+# word-splitting of an unquoted string.
+DOCKER_CMD=(docker run --rm -e GIT_PYTHON_REFRESH=quiet)
+
+# Captured output goes to a private scratch directory instead of fixed /tmp
+# names, so concurrent runs cannot clobber each other; removed on exit.
+LOG_DIR="$(mktemp -d)"
+trap 'rm -rf "$LOG_DIR"' EXIT
+
+# run_check LABEL LOG_NAME CMD...
+#   Runs CMD in a fresh container of $IMAGE with stdout/stderr captured in
+#   $LOG_DIR/LOG_NAME. Prints the pass mark on success; on failure prints the
+#   fail mark followed by the captured output and exits the script with 1.
+run_check() {
+    local label="$1" log="$LOG_DIR/$2"
+    shift 2
+
+    echo -n "${label}... "
+    if "${DOCKER_CMD[@]}" "$IMAGE" "$@" >"$log" 2>&1; then
+        echo "✓ 通过"
+    else
+        echo "✗ 失败"
+        cat "$log"
+        exit 1
+    fi
+}
+
+# Exercises the tracking API against a throwaway SQLite store and reads the
+# run back, so a silently dropped param or metric fails the check.
+TRACKING_CHECK=$(cat <<'PY'
+import os
+import tempfile
+import mlflow
+
+tmpdir = tempfile.mkdtemp()
+db_path = os.path.join(tmpdir, 'mlflow.db')
+
+mlflow.set_tracking_uri('sqlite:///' + db_path)
+mlflow.set_experiment('ci-test-experiment')
+
+with mlflow.start_run() as run:
+    mlflow.log_param('learning_rate', 0.01)
+    mlflow.log_metric('accuracy', 0.95)
+    assert mlflow.active_run() is not None
+    assert run.info.run_id is not None
+
+stored = mlflow.get_run(run.info.run_id)
+assert stored.data.params['learning_rate'] == '0.01'
+assert abs(stored.data.metrics['accuracy'] - 0.95) < 1e-9
+
+print('MLflow experiment tracking works')
+PY
+)
+
+# Starts a tracking server inside the container and polls /health for up to
+# 60 seconds. The probe uses python3, which the image must already provide for
+# MLflow, rather than assuming curl exists in the minimal base image.
+SERVER_CHECK=$(cat <<'SH'
+set -e
+
+workdir=$(mktemp -d)
+
+mlflow server \
+    --backend-store-uri "sqlite:///$workdir/mlflow.db" \
+    --default-artifact-root "$workdir/artifacts" \
+    --host 127.0.0.1 \
+    --port 5000 >/tmp/mlflow_server.log 2>&1 &
+pid=$!
+
+probe() {
+    python3 -c "import urllib.request; urllib.request.urlopen('http://127.0.0.1:5000/health', timeout=2)" >/dev/null 2>&1
+}
+
+for _ in $(seq 1 60); do
+    if probe; then
+        kill "$pid"
+        wait "$pid" 2>/dev/null || true
+        exit 0
+    fi
+
+    # Server process died before becoming healthy: surface its log.
+    if ! kill -0 "$pid" 2>/dev/null; then
+        cat /tmp/mlflow_server.log
+        exit 1
+    fi
+
+    sleep 1
+done
+
+cat /tmp/mlflow_server.log
+kill "$pid" 2>/dev/null || true
+exit 1
+SH
+)
+
+echo "=== MLflow 基础功能测试 ==="
+
+run_check "检查 MLflow import 和版本" mlflow_import.log \
+    python3 -c "import mlflow; print(mlflow.__version__)"
+
+run_check "检查 MLflow CLI" mlflow_cli.log mlflow --version
+
+run_check "检查实验追踪核心功能" mlflow_tracking.log python3 -c "$TRACKING_CHECK"
+
+run_check "检查 MLflow Tracking Server 启动" mlflow_server_check.log \
+    bash -c "$SERVER_CHECK"
+
+echo "=== 所有测试通过 ==="
diff --git a/frameworks/MLflow/3.10.1/test_result.png b/frameworks/MLflow/3.10.1/test_result.png
new file mode 100644
index 0000000000000000000000000000000000000000..a5605dbb07711f3a7fc4d1fd311ef03a7dac8c26
Binary files /dev/null and b/frameworks/MLflow/3.10.1/test_result.png differ