updates

Vincent Li · Vincent Li · commit 6fe68b48a427 · 2026-04-01T07:40:32.000Z
diff --git a/.ci/config.yaml b/.ci/config.yaml
@@ -1,70 +1,146 @@
+repo:
+  url: https://github.com/InfiniTensor/InfiniOps.git
+  branch: master
+
+github:
+  status_context_prefix: "ci/infiniops"
+
+# To dispatch jobs to remote machines via `agent.py run`, uncomment the block below and
+# replace each placeholder URL with the real address of that platform's agent host. This is
+# required on the trigger machine when agents run on separate hosts (see the README, multi-machine deployment).
+# agents:
+#   nvidia:
+#     url: http://nvidia-host:8080
+#   iluvatar:
+#     url: http://iluvatar-host:8080
+#   metax:
+#     url: http://metax-host:8080
+#   moore:
+#     url: http://moore-host:8080
+#   cambricon:
+#     url: http://cambricon-host:8080
+
 platforms:
   nvidia:
     image:
+      dockerfile: .ci/images/nvidia/
       build_args:
-        BASE_IMAGE: vllm/vllm-openai:v0.10.0
+        BASE_IMAGE: nvcr.io/nvidia/pytorch:24.10-py3
+    setup: pip install .[dev] --no-build-isolation
+    jobs:
+      gpu:
+        resources:
+          ngpus: 1                         # Scheduler auto-picks this many free GPUs
+          memory: 32GB
+          shm_size: 16g                    # Raise /dev/shm above Docker's 64 MB default, which is too small for PyTorch
+          timeout: 3600
+        # env:                             # Uncomment to inject extra env vars into the container.
+        #   MY_VAR: value
+        stages:
+          - name: test
+            run: pytest tests/ -n 8 -v --tb=short --junitxml=/workspace/results/test-results.xml
+
+  iluvatar:
+    image:
+      dockerfile: .ci/images/iluvatar/
+      build_args:
+        BASE_IMAGE: corex:qs_pj20250825
+        APT_MIRROR: http://archive.ubuntu.com/ubuntu
+        PIP_INDEX_URL: https://pypi.org/simple
     docker_args:
-      - "--gpus all"
-      - "-u root"
-      - "--network host"
       - "--privileged"
       - "--cap-add=ALL"
       - "--pid=host"
       - "--ipc=host"
-      - "--entrypoint=''"
-      - "--workdir=/workspace"
-      - "-e CUDA_DEVICE_ORDER=PCI_BUS_ID"
     volumes:
       - /dev:/dev
-      - /home/zkjh/weight:/home/weight
-      - /home/zkjh/workspace:/workspace
+      - /lib/firmware:/lib/firmware
+      - /usr/src:/usr/src
+      - /lib/modules:/lib/modules
+    setup: pip install .[dev] --no-build-isolation
     jobs:
       gpu:
         resources:
-          ngpus: 2
-          shm_size: 80g
-        env:
-          MODEL_LIST: Qwen3-32B-FP8
-          IMAGE: v0.10.0
-          ENGINE: vLLM
-  ascend:
+          gpu_ids: "0"                     # GPU visibility via CUDA_VISIBLE_DEVICES
+          gpu_style: none                  # CoreX: passthrough via --privileged + /dev mount
+          memory: 32GB
+          shm_size: 16g
+          timeout: 3600
+        stages:
+          - name: test
+            run: pytest tests/ -n 8 -v --tb=short --junitxml=/workspace/results/test-results.xml
+
+  metax:
     image:
+      dockerfile: .ci/images/metax/
       build_args:
-        BASE_IMAGE: swr.cn-south-1.myhuaweicloud.com/ascendhub/mindie:2.1.RC1-800I-A2-py311-openeuler24.03-lts
+        BASE_IMAGE: cr.metax-tech.com/public-library/maca-pytorch:3.2.1.4-torch2.4-py310-ubuntu22.04-amd64
+        APT_MIRROR: http://archive.ubuntu.com/ubuntu
+        PIP_INDEX_URL: https://pypi.org/simple
     docker_args:
       - "--privileged"
-      - "--cap-add=ALL"
-      - "--pid=host"
-      - "--ipc=host"
-      - "--net=host"
-      - "--device=/dev/davinci0"
-      - "--device=/dev/davinci1"
-      - "--device=/dev/davinci2"
-      - "--device=/dev/davinci3"
-      - "--device=/dev/davinci4"
-      - "--device=/dev/davinci5"
-      - "--device=/dev/davinci6"
-      - "--device=/dev/davinci7"
-      - "--device=/dev/davinci_manager"
-      - "--device=/dev/hisi_hdc"
-      - "--device=/dev/devmm_svm"
-    volumes:
-      - /usr/local/Ascend/driver/lib64:/usr/local/Ascend/driver/lib64
-      - /usr/local/Ascend/driver/include:/usr/local/Ascend/driver/include
-      - /usr/local/Ascend/driver/tools:/usr/local/Ascend/driver/tools
-      - /usr/local/Ascend/driver:/usr/local/Ascend/driver
-      - /usr/local/Ascend/firmware:/usr/local/Ascend/firmware
-      - /usr/local/sbin/npu-smi:/usr/local/sbin/npu-smi
-      - /usr/local/sbin:/usr/local/sbin
-      - /etc/hccn.conf:/etc/hccn.conf
-      - /home/zkjh/weight:/home/weight
-      - /home/zkjh:/home/zkjh
+      - "--ulimit=memlock=-1"            # -1 = no cap on locked memory
+      - "--ulimit=stack=67108864"        # 64 MiB stack limit (67108864 bytes)
+    setup: pip install .[dev] --no-build-isolation
+    jobs:
+      gpu:
+        resources:
+          gpu_ids: "0"
+          gpu_style: none                  # MetaX: passthrough via --privileged, no CUDA_VISIBLE_DEVICES
+          memory: 32GB
+          shm_size: 16g
+          timeout: 3600
+        stages:
+          - name: test
+            run: pytest tests/ -n 4 -v --tb=short --junitxml=/workspace/results/test-results.xml
+
+  moore:
+    image:
+      dockerfile: .ci/images/moore/
+      build_args:
+        BASE_IMAGE: sh-harbor.mthreads.com/mcctest/vllm_musa:20251112_hygon
+        APT_MIRROR: http://archive.ubuntu.com/ubuntu
+        PIP_INDEX_URL: https://pypi.org/simple
+    docker_args:
+      - "--privileged"
+    setup: pip install .[dev] --no-build-isolation
     jobs:
-      npu:
+      gpu:
         resources:
-          ngpus: 1
-          shm_size: 500g
-        env:
-          MODEL_LIST: DeepSeek-R1-Distill-Qwen-1.5B
-          IMAGE: 2.1.RC1-800I-A2-py311-openeuler24.03-lts
-          ENGINE: MindIE
+          gpu_ids: "0"
+          gpu_style: none                  # Moore: passthrough via --privileged, MTHREADS_VISIBLE_DEVICES set by base image
+          memory: 32GB
+          shm_size: 16g
+          timeout: 3600
+        stages:
+          - name: test
+            run: pytest tests/test_add.py tests/test_gemm.py tests/test_swiglu.py -n 4 -v --tb=short --junitxml=/workspace/results/test-results.xml
+
+  cambricon:
+    image:
+      dockerfile: .ci/images/cambricon/
+      build_args:
+        BASE_IMAGE: cambricon/pytorch:v1.25.3-torch2.1-anolisos8.8-py310
+        PIP_INDEX_URL: https://pypi.org/simple
+    docker_args:
+      - "--privileged"
+    setup: pip install .[dev] --no-build-isolation
+    jobs:
+      gpu:
+        resources:
+          gpu_ids: "0"
+          gpu_style: mlu                   # Cambricon: passthrough via --privileged, MLU_VISIBLE_DEVICES for device control
+          memory: 32GB
+          shm_size: 16g
+          timeout: 3600
+        stages:
+          - name: test
+            run: pytest tests/test_gemm.py -n 4 -v --tb=short --junitxml=/workspace/results/test-results.xml
+
+  ascend:                                  # TODO: Ascend image is not ready yet
+    image:
+      dockerfile: .ci/images/ascend/
+      build_args:
+        BASE_IMAGE: ascendhub.huawei.com/public-ascendhub/ascend-pytorch:24.0.0
+      private_sdk:
+        source_env: PRIVATE_SDK_URL
diff --git a/.ci/config.yaml.1 b/.ci/config.yaml.1
@@ -0,0 +1,70 @@
+platforms:
+  nvidia:
+    image:
+      build_args:
+        BASE_IMAGE: vllm/vllm-openai:v0.10.0
+    docker_args:
+      - "--gpus all"                      # expose every host GPU to the container
+      - "-u root"                         # run the container process as root
+      - "--network host"                  # share the host's network namespace
+      - "--privileged"
+      - "--cap-add=ALL"
+      - "--pid=host"                      # share the host's PID namespace
+      - "--ipc=host"                      # share the host's IPC namespace
+      - "--entrypoint=''"                 # presumably clears the image ENTRYPOINT once the inner quotes are shell-expanded — confirm
+      - "--workdir=/workspace"            # matches the /workspace volume mount below
+      - "-e CUDA_DEVICE_ORDER=PCI_BUS_ID" # number GPUs by PCI bus ID
+    volumes:
+      - /dev:/dev
+      - /home/zkjh/weight:/home/weight
+      - /home/zkjh/workspace:/workspace
+    jobs:
+      gpu:
+        resources:
+          ngpus: 2
+          shm_size: 80g
+        env:
+          MODEL_LIST: Qwen3-32B-FP8
+          IMAGE: v0.10.0
+          ENGINE: vLLM
+  ascend:
+    image:
+      build_args:
+        BASE_IMAGE: swr.cn-south-1.myhuaweicloud.com/ascendhub/mindie:2.1.RC1-800I-A2-py311-openeuler24.03-lts
+    docker_args:
+      - "--privileged"
+      - "--cap-add=ALL"
+      - "--pid=host"
+      - "--ipc=host"
+      - "--net=host"
+      - "--device=/dev/davinci0"
+      - "--device=/dev/davinci1"
+      - "--device=/dev/davinci2"
+      - "--device=/dev/davinci3"
+      - "--device=/dev/davinci4"
+      - "--device=/dev/davinci5"
+      - "--device=/dev/davinci6"
+      - "--device=/dev/davinci7"
+      - "--device=/dev/davinci_manager"
+      - "--device=/dev/hisi_hdc"
+      - "--device=/dev/devmm_svm"
+    volumes:
+      - /usr/local/Ascend/driver/lib64:/usr/local/Ascend/driver/lib64
+      - /usr/local/Ascend/driver/include:/usr/local/Ascend/driver/include
+      - /usr/local/Ascend/driver/tools:/usr/local/Ascend/driver/tools
+      - /usr/local/Ascend/driver:/usr/local/Ascend/driver
+      - /usr/local/Ascend/firmware:/usr/local/Ascend/firmware
+      - /usr/local/sbin/npu-smi:/usr/local/sbin/npu-smi
+      - /usr/local/sbin:/usr/local/sbin
+      - /etc/hccn.conf:/etc/hccn.conf
+      - /home/zkjh/weight:/home/weight
+      - /home/zkjh:/home/zkjh
+    jobs:
+      npu:
+        resources:
+          ngpus: 1
+          shm_size: 500g
+        env:
+          MODEL_LIST: DeepSeek-R1-Distill-Qwen-1.5B
+          IMAGE: 2.1.RC1-800I-A2-py311-openeuler24.03-lts
+          ENGINE: MindIE
diff --git a/.github/workflows/ci_child.yml b/.github/workflows/ci_child.yml
@@ -21,6 +21,9 @@ jobs:
     runs-on: self-hosted
     timeout-minutes: ${{ matrix.timeout_minutes }}
     steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+
       - name: Show stage plan
         env:
           MATRIX_JSON: ${{ inputs.matrix_json }}
@@ -57,38 +60,23 @@ jobs:
                   f.write("\n```\n\n")
           PY
 
-      - name: Trigger ${{ matrix.platform }} GPU Smoke Test job
-        env: ${{ matrix.job_env }}
+      - name: Build image via .ci/build.py
+        # Matrix values go through env rather than being spliced into the
+        # script text, so shell metacharacters in them cannot inject commands.
+        env:
+          PLATFORM: ${{ matrix.platform }}
+        run: |
+          python3 .ci/build.py \
+            --config .ci/config.yaml \
+            --platform "$PLATFORM" \
+            --force
+
+      - name: Run stages via .ci/run.py (sequential inside job)
+        # Same env indirection as the build step, here for the job id.
+        env:
+          JOB_ID: ${{ matrix.id }}
         run: |
-              echo "Smoke Test Starting..."
-              echo "MODEL_LIST = ${{ matrix.job_env.MODEL_LIST }}"
-              echo "Version = ${{ matrix.job_env.IMAGE }}"
-              set -m
-              cd /home/zkjh
-              if [ ! -d "${{ matrix.platform }}_test" ]; then
-                mkdir -p "${{ matrix.platform }}_test"
-              fi
-              cd ${{ matrix.platform }}_test
-              mkdir -p ${{ matrix.platform }}_${{ github.run_id }}_${{ matrix.id }}
-              cd ${{ matrix.platform }}_${{ github.run_id }}_${{ matrix.id }}
-              git init
-              git remote add origin git@github.com:Vincent777/ci_autotest.git
-              git fetch --depth=1 origin main
-              git show origin/main:ascend_test_suite/daemon.sh > daemon.sh
-              chmod a+x daemon.sh
-              DOCKER_ARGS="${{ join(matrix.docker_args, ' ') }}"
-              VOLUME_ARGS=$(printf " -v %s" ${{ join(matrix.volumes, ' ') }})
-              SHM_SIZE="--shm-size=${{ matrix.shm_size }}"
-              echo "DOCKER_ARGS=$DOCKER_ARGS"
-              echo "VOLUME_ARGS=$VOLUME_ARGS"
-              echo "SHM_SIZE=$SHM_SIZE"
-              PLATFORM=${{ matrix.platform }}
-              ./daemon.sh \
-                "${PLATFORM^}" \
-                Smoke \
-                ${{ matrix.job_env.ENGINE }}  \
-                "${{ matrix.job_env.MODEL_LIST }}:${{ matrix.ngpus }}" \
-                "${DOCKER_ARGS} ${VOLUME_ARGS} ${SHM_SIZE}" \
-                ${{ github.run_id }}  \
-                ${{ matrix.job_env.IMAGE }}
-  
+          python3 .ci/run.py \
+            --config .ci/config.yaml \
+            --job "$JOB_ID" \
+            --local
diff --git a/.github/workflows/ci_child.yml.1 b/.github/workflows/ci_child.yml.1