diff --git a/frameworks/tensorrt-llm/1.2.0/Dockerfile b/frameworks/tensorrt-llm/1.2.0/Dockerfile
new file mode 100644
index 0000000000000000000000000000000000000000..5c8804654db6a2d19361edc1a83bb12014ebb7ff
--- /dev/null
+++ b/frameworks/tensorrt-llm/1.2.0/Dockerfile
@@ -0,0 +1,79 @@
+# TensorRT-LLM image on OpenCloudOS 9: CUDA 13.0 packages, OpenMPI and a Python 3.12 venv.
+# NOTE(review): the base image is referenced by the mutable ":latest" tag; pin a release
+# tag or digest once one is agreed on so that rebuilds are reproducible.
+FROM opencloudos/opencloudos9-minimal:latest
+
+LABEL maintainer="stronking 363133710@qq.com"
+LABEL org.opencontainers.image.source="https://gitee.com/OpenCloudOS/ai-agent-container"
+LABEL org.opencontainers.image.description="TensorRT-LLM on OpenCloudOS 9 with Python 3.12, CUDA 13.0"
+
+# Read by the NVIDIA container runtime: make every host GPU visible inside the container.
+ENV NVIDIA_VISIBLE_DEVICES=all
+
+# Install the epol-release repository package, then refresh metadata and upgrade the base
+# packages; the dnf cache is dropped in the same layer so it is not baked into the image.
+RUN dnf install -y epol-release \
+    && dnf makecache \
+    && dnf upgrade -y \
+    && dnf clean all
+
+# CUDA 13.0 toolkit plus the cuDNN, NCCL and cuBLASMp runtime/development packages.
+RUN dnf install -y \
+        cuda-toolkit-13-0 \
+        cuda-devel-13-0 \
+        libcudnn9-cuda-13 \
+        libcudnn9-devel-cuda-13 \
+        libcudnn9-headers-cuda-13 \
+        libcudnn9-static-cuda-13 \
+        libnccl-cuda-13-0 \
+        libnccl-devel-cuda-13-0 \
+        libnccl-static-cuda-13-0 \
+        nccl-cuda-13-0 \
+        libcublasmp0-cuda-13 \
+        libcublasmp0-devel-cuda-13 \
+    && dnf clean all \
+    && rm -rf /var/cache/dnf /var/cache/yum/*
+
+# Python 3.12 and OpenMPI. The package cache is cleaned in this same layer; a cleanup in a
+# later RUN would not shrink the image.
+RUN dnf install -y \
+        openmpi \
+        openmpi-devel \
+        python3.12 \
+        python3.12-devel \
+    && dnf clean all \
+    && rm -rf /var/cache/dnf /var/cache/yum/*
+
+# All Python packages live in a dedicated virtual environment. --no-cache-dir keeps pip's
+# download cache out of the layer.
+ENV VIRTUAL_ENV=/home/python3.12
+RUN python3.12 -m ensurepip \
+    && python3.12 -m venv "${VIRTUAL_ENV}" \
+    && "${VIRTUAL_ENV}/bin/pip" install --no-cache-dir --upgrade pip setuptools wheel packaging build
+
+# Declared directly before its first use so that overriding the version with --build-arg
+# does not invalidate the cached OS/CUDA layers above.
+ARG TENSORRTLLM_VERSION=1.2.0
+RUN "${VIRTUAL_ENV}/bin/pip" install --no-cache-dir \
+        --extra-index-url https://pypi.nvidia.com \
+        "tensorrt-llm==${TENSORRTLLM_VERSION}"
+
+# Runtime environment:
+# - the venv's bin directory is first on PATH, so plain `python`/`pip` resolve to the venv
+#   even in shells that never read /root/.bashrc (e.g. `docker run IMAGE python ...`);
+# - LD_LIBRARY_PATH only gets a ":<old value>" suffix when an old value exists, because an
+#   empty entry would make the dynamic loader search the current working directory.
+ENV PATH=${VIRTUAL_ENV}/bin:/usr/local/cuda/bin:${PATH} \
+    LD_LIBRARY_PATH=/usr/lib64/openmpi/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}} \
+    PYTHONUNBUFFERED=1
+
+# Wrap platform.python_version() so that a trailing "+" in the reported version is stripped.
+# NOTE(review): presumably some dependency rejects the "+" suffix - confirm which one.
+RUN printf 'import platform as _p\n_o=_p.python_version\n_p.python_version=lambda:_o().rstrip("+")\n' \
+        > "${VIRTUAL_ENV}/lib/python3.12/site-packages/sitecustomize.py"
+
+# Activate the venv in interactive root bash shells and record the build date.
+RUN echo 'source /home/python3.12/bin/activate' >> /root/.bashrc \
+    && date +"%Y/%m/%d" > /etc/buildinfo
+
+CMD ["/home/python3.12/bin/python"]
diff --git a/frameworks/tensorrt-llm/1.2.0/README.md
b/frameworks/tensorrt-llm/1.2.0/README.md new file mode 100644 index 0000000000000000000000000000000000000000..9757072595a0b9f40f60eeebcc96e10450edddf0 --- /dev/null +++ b/frameworks/tensorrt-llm/1.2.0/README.md @@ -0,0 +1,94 @@ +# TensorRT-LLM 1.2.0 on OpenCloudOS 9 + +## 基本信息 + +- **框架版本**:TensorRT-LLM v1.2.0 +- **基础镜像**:opencloudos/opencloudos9-minimal:latest +- **Python 版本**:Python 3.12 +- **CUDA 版本**:CUDA 13.0 或更高 +- **GPU 支持**:需要 NVIDIA GPU 与 NVIDIA Container Toolkit +- **MPI 支持**:已集成 OpenMPI / mpi4py +- **主要用途**: + - TensorRT-LLM Python 环境验证 + - CUDA / PyTorch GPU 功能测试 + - TensorRT-LLM 推理环境基础镜像 + - 后续模型构建、Engine 转换与推理测试 + +## 构建 + +在 Dockerfile 所在目录执行: + +```bash +docker build -t oc9-tensorrt-llm:1.2.0 . +``` + +## 镜像启动命令 + +#### 后台启动容器: +```bash +docker run -d \ + --gpus all \ + --name oc9-tensorrt-llm \ + oc9-tensorrt-llm:1.2.0 \ + sleep infinity +``` +#### 进入容器 +```bash +docker exec -it oc9-tensorrt-llm bash +``` + +## 测试镜像 + +#### 在容器内执行基础验证 +```bash +docker run --rm --gpus all oc9-tensorrt-llm:1.2.0 bash -lc " +nvidia-smi && +python -c 'import torch; print(torch.cuda.is_available()); print(torch.cuda.get_device_name(0))' && +python -c 'import tensorrt_llm; print(tensorrt_llm.__version__)' && +python -c 'from mpi4py import MPI; print(MPI.Get_version())' +" +``` + +## 验证内容 + +- `nvidia-smi`:容器内可以访问 NVIDIA GPU 与驱动 +- `torch.cuda.is_available()` / `torch.cuda.get_device_name(0)`:PyTorch 可以使用 CUDA 并识别 GPU +- `tensorrt_llm.__version__`:TensorRT-LLM 可以正常导入并输出版本号 +- `MPI.Get_version()`:mpi4py 可以正常导入并输出 MPI 标准版本 + +## 注意事项 + +1. 启动容器时必须添加: + +```bash +--gpus all +``` + +否则容器内无法访问 NVIDIA GPU。 + +2. 宿主机需要安装 NVIDIA Container Toolkit。 + +3. 当前测试只验证基础运行环境,不包含完整模型推理流程。 + +4. TensorRT-LLM 真正推理通常需要额外步骤: + +```text +下载模型 → 转换 checkpoint → 构建 TensorRT engine → 运行推理脚本 +``` + + +## 常见问题 + +### 1. 
ModuleNotFoundError: No module named 'torch' + +通常是 Python 虚拟环境没有进入默认 PATH。需要确认镜像中包含: + +```dockerfile +ENV VIRTUAL_ENV=/home/python3.12 +ENV PATH=/home/python3.12/bin:$PATH +``` +或者需要确认虚拟环境是否被激活 +```bash +source /home/python3.12/bin/activate +``` + diff --git a/frameworks/tensorrt-llm/1.2.0/build.conf b/frameworks/tensorrt-llm/1.2.0/build.conf new file mode 100644 index 0000000000000000000000000000000000000000..5eaab65ea241d4a2f8d87300a073f9cf8b5c7365 --- /dev/null +++ b/frameworks/tensorrt-llm/1.2.0/build.conf @@ -0,0 +1,5 @@ +# TensorRT-LLM 1.2.0 on OpenCloudOS 9 (GPU) +IMAGE_NAME=oc9-tensorrt-llm + +IMAGE_TAG=1.2.0 +GPU_TEST=false \ No newline at end of file diff --git a/frameworks/tensorrt-llm/1.2.0/test.sh b/frameworks/tensorrt-llm/1.2.0/test.sh new file mode 100644 index 0000000000000000000000000000000000000000..973d808943086e4db0c8c2caa67a7ef59f0867aa --- /dev/null +++ b/frameworks/tensorrt-llm/1.2.0/test.sh @@ -0,0 +1,40 @@ +#!/bin/bash +set -euo pipefail + +IMAGE="${1:?用法: bash test.sh }" + +echo "=== TensorRT-LLM GPU 镜像功能测试 ===" + +echo -n "检查 NVIDIA GPU 驱动 & CUDA... " +docker run --rm --gpus all "$IMAGE" bash -lc ' +nvidia-smi >/dev/null +nvcc --version >/dev/null +/home/python3.12/bin/python -c "import torch; assert torch.cuda.is_available(); print(\"GPU:\", torch.cuda.get_device_name(0))" +' +echo "✓ 通过" + +echo -n "检查 tensorrt_llm import... " +docker run --rm --gpus all "$IMAGE" bash -lc ' +/home/python3.12/bin/python -c "import tensorrt_llm; print(\"tensorrt_llm version:\", tensorrt_llm.__version__)" +' +echo "✓ 通过" + +echo -n "检查 torch CUDA 张量计算... " +docker run --rm --gpus all "$IMAGE" bash -lc ' +/home/python3.12/bin/python - <