Skip to content

Commit 6fe68b4

Browse files
author
Vincent Li
committed
updates
1 parent 8c0a6e2 commit 6fe68b4

4 files changed

Lines changed: 306 additions & 84 deletions

File tree

.ci/config.yaml

Lines changed: 126 additions & 50 deletions
Original file line numberDiff line numberDiff line change
@@ -1,70 +1,146 @@
1+
repo:
2+
url: https://github.com/InfiniTensor/InfiniOps.git
3+
branch: master
4+
5+
github:
6+
status_context_prefix: "ci/infiniops"
7+
8+
# Uncomment and replace the URLs below with actual host IPs to dispatch jobs to remote
9+
# machines via `agent.py run`. Required on the trigger machine when each platform's
10+
# agent runs on a separate host. See the README for multi-machine deployment details.
11+
# agents:
12+
# nvidia:
13+
# url: http://nvidia-host:8080
14+
# iluvatar:
15+
# url: http://iluvatar-host:8080
16+
# metax:
17+
# url: http://metax-host:8080
18+
# moore:
19+
# url: http://moore-host:8080
20+
# cambricon:
21+
# url: http://cambricon-host:8080
22+
123
platforms:
224
nvidia:
325
image:
26+
dockerfile: .ci/images/nvidia/
427
build_args:
5-
BASE_IMAGE: vllm/vllm-openai:v0.10.0
28+
BASE_IMAGE: nvcr.io/nvidia/pytorch:24.10-py3
29+
setup: pip install .[dev] --no-build-isolation
30+
jobs:
31+
gpu:
32+
resources:
33+
ngpus: 1 # Scheduler auto-picks this many free GPUs
34+
memory: 32GB
35+
shm_size: 16g # Raise Docker's default 64MB /dev/shm limit, which PyTorch workers can exhaust
36+
timeout: 3600
37+
# env: # Uncomment to inject extra env vars into the container.
38+
# MY_VAR: value
39+
stages:
40+
- name: test
41+
run: pytest tests/ -n 8 -v --tb=short --junitxml=/workspace/results/test-results.xml
42+
43+
iluvatar:
44+
image:
45+
dockerfile: .ci/images/iluvatar/
46+
build_args:
47+
BASE_IMAGE: corex:qs_pj20250825
48+
APT_MIRROR: http://archive.ubuntu.com/ubuntu
49+
PIP_INDEX_URL: https://pypi.org/simple
650
docker_args:
7-
- "--gpus all"
8-
- "-u root"
9-
- "--network host"
1051
- "--privileged"
1152
- "--cap-add=ALL"
1253
- "--pid=host"
1354
- "--ipc=host"
14-
- "--entrypoint=''"
15-
- "--workdir=/workspace"
16-
- "-e CUDA_DEVICE_ORDER=PCI_BUS_ID"
1755
volumes:
1856
- /dev:/dev
19-
- /home/zkjh/weight:/home/weight
20-
- /home/zkjh/workspace:/workspace
57+
- /lib/firmware:/lib/firmware
58+
- /usr/src:/usr/src
59+
- /lib/modules:/lib/modules
60+
setup: pip install .[dev] --no-build-isolation
2161
jobs:
2262
gpu:
2363
resources:
24-
ngpus: 2
25-
shm_size: 80g
26-
env:
27-
MODEL_LIST: Qwen3-32B-FP8
28-
IMAGE: v0.10.0
29-
ENGINE: vLLM
30-
ascend:
64+
gpu_ids: "0" # GPU visibility via CUDA_VISIBLE_DEVICES
65+
gpu_style: none # CoreX: passthrough via --privileged + /dev mount
66+
memory: 32GB
67+
shm_size: 16g
68+
timeout: 3600
69+
stages:
70+
- name: test
71+
run: pytest tests/ -n 8 -v --tb=short --junitxml=/workspace/results/test-results.xml
72+
73+
metax:
3174
image:
75+
dockerfile: .ci/images/metax/
3276
build_args:
33-
BASE_IMAGE: swr.cn-south-1.myhuaweicloud.com/ascendhub/mindie:2.1.RC1-800I-A2-py311-openeuler24.03-lts
77+
BASE_IMAGE: cr.metax-tech.com/public-library/maca-pytorch:3.2.1.4-torch2.4-py310-ubuntu22.04-amd64
78+
APT_MIRROR: http://archive.ubuntu.com/ubuntu
79+
PIP_INDEX_URL: https://pypi.org/simple
3480
docker_args:
3581
- "--privileged"
36-
- "--cap-add=ALL"
37-
- "--pid=host"
38-
- "--ipc=host"
39-
- "--net=host"
40-
- "--device=/dev/davinci0"
41-
- "--device=/dev/davinci1"
42-
- "--device=/dev/davinci2"
43-
- "--device=/dev/davinci3"
44-
- "--device=/dev/davinci4"
45-
- "--device=/dev/davinci5"
46-
- "--device=/dev/davinci6"
47-
- "--device=/dev/davinci7"
48-
- "--device=/dev/davinci_manager"
49-
- "--device=/dev/hisi_hdc"
50-
- "--device=/dev/devmm_svm"
51-
volumes:
52-
- /usr/local/Ascend/driver/lib64:/usr/local/Ascend/driver/lib64
53-
- /usr/local/Ascend/driver/include:/usr/local/Ascend/driver/include
54-
- /usr/local/Ascend/driver/tools:/usr/local/Ascend/driver/tools
55-
- /usr/local/Ascend/driver:/usr/local/Ascend/driver
56-
- /usr/local/Ascend/firmware:/usr/local/Ascend/firmware
57-
- /usr/local/sbin/npu-smi:/usr/local/sbin/npu-smi
58-
- /usr/local/sbin:/usr/local/sbin
59-
- /etc/hccn.conf:/etc/hccn.conf
60-
- /home/zkjh/weight:/home/weight
61-
- /home/zkjh:/home/zkjh
82+
- "--ulimit=memlock=-1"
83+
- "--ulimit=stack=67108864"
84+
setup: pip install .[dev] --no-build-isolation
85+
jobs:
86+
gpu:
87+
resources:
88+
gpu_ids: "0"
89+
gpu_style: none # MetaX: passthrough via --privileged, no CUDA_VISIBLE_DEVICES
90+
memory: 32GB
91+
shm_size: 16g
92+
timeout: 3600
93+
stages:
94+
- name: test
95+
run: pytest tests/ -n 4 -v --tb=short --junitxml=/workspace/results/test-results.xml
96+
97+
moore:
98+
image:
99+
dockerfile: .ci/images/moore/
100+
build_args:
101+
BASE_IMAGE: sh-harbor.mthreads.com/mcctest/vllm_musa:20251112_hygon
102+
APT_MIRROR: http://archive.ubuntu.com/ubuntu
103+
PIP_INDEX_URL: https://pypi.org/simple
104+
docker_args:
105+
- "--privileged"
106+
setup: pip install .[dev] --no-build-isolation
62107
jobs:
63-
npu:
108+
gpu:
64109
resources:
65-
ngpus: 1
66-
shm_size: 500g
67-
env:
68-
MODEL_LIST: DeepSeek-R1-Distill-Qwen-1.5B
69-
IMAGE: 2.1.RC1-800I-A2-py311-openeuler24.03-lts
70-
ENGINE: MindIE
110+
gpu_ids: "0"
111+
gpu_style: none # Moore: passthrough via --privileged, MTHREADS_VISIBLE_DEVICES set by base image
112+
memory: 32GB
113+
shm_size: 16g
114+
timeout: 3600
115+
stages:
116+
- name: test
117+
run: pytest tests/test_add.py tests/test_gemm.py tests/test_swiglu.py -n 4 -v --tb=short --junitxml=/workspace/results/test-results.xml
118+
119+
cambricon:
120+
image:
121+
dockerfile: .ci/images/cambricon/
122+
build_args:
123+
BASE_IMAGE: cambricon/pytorch:v1.25.3-torch2.1-anolisos8.8-py310
124+
PIP_INDEX_URL: https://pypi.org/simple
125+
docker_args:
126+
- "--privileged"
127+
setup: pip install .[dev] --no-build-isolation
128+
jobs:
129+
gpu:
130+
resources:
131+
gpu_ids: "0"
132+
gpu_style: mlu # Cambricon: passthrough via --privileged, MLU_VISIBLE_DEVICES for device control
133+
memory: 32GB
134+
shm_size: 16g
135+
timeout: 3600
136+
stages:
137+
- name: test
138+
run: pytest tests/test_gemm.py -n 4 -v --tb=short --junitxml=/workspace/results/test-results.xml
139+
140+
ascend: # TODO: Ascend image is not ready yet
141+
image:
142+
dockerfile: .ci/images/ascend/
143+
build_args:
144+
BASE_IMAGE: ascendhub.huawei.com/public-ascendhub/ascend-pytorch:24.0.0
145+
private_sdk:
146+
source_env: PRIVATE_SDK_URL

.ci/config.yaml.1

Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,70 @@
1+
platforms:
2+
nvidia:
3+
image:
4+
build_args:
5+
BASE_IMAGE: vllm/vllm-openai:v0.10.0
6+
docker_args:
7+
- "--gpus all"
8+
- "-u root"
9+
- "--network host"
10+
- "--privileged"
11+
- "--cap-add=ALL"
12+
- "--pid=host"
13+
- "--ipc=host"
14+
- "--entrypoint=''"
15+
- "--workdir=/workspace"
16+
- "-e CUDA_DEVICE_ORDER=PCI_BUS_ID"
17+
volumes:
18+
- /dev:/dev
19+
- /home/zkjh/weight:/home/weight
20+
- /home/zkjh/workspace:/workspace
21+
jobs:
22+
gpu:
23+
resources:
24+
ngpus: 2
25+
shm_size: 80g
26+
env:
27+
MODEL_LIST: Qwen3-32B-FP8
28+
IMAGE: v0.10.0
29+
ENGINE: vLLM
30+
ascend:
31+
image:
32+
build_args:
33+
BASE_IMAGE: swr.cn-south-1.myhuaweicloud.com/ascendhub/mindie:2.1.RC1-800I-A2-py311-openeuler24.03-lts
34+
docker_args:
35+
- "--privileged"
36+
- "--cap-add=ALL"
37+
- "--pid=host"
38+
- "--ipc=host"
39+
- "--net=host"
40+
- "--device=/dev/davinci0"
41+
- "--device=/dev/davinci1"
42+
- "--device=/dev/davinci2"
43+
- "--device=/dev/davinci3"
44+
- "--device=/dev/davinci4"
45+
- "--device=/dev/davinci5"
46+
- "--device=/dev/davinci6"
47+
- "--device=/dev/davinci7"
48+
- "--device=/dev/davinci_manager"
49+
- "--device=/dev/hisi_hdc"
50+
- "--device=/dev/devmm_svm"
51+
volumes:
52+
- /usr/local/Ascend/driver/lib64:/usr/local/Ascend/driver/lib64
53+
- /usr/local/Ascend/driver/include:/usr/local/Ascend/driver/include
54+
- /usr/local/Ascend/driver/tools:/usr/local/Ascend/driver/tools
55+
- /usr/local/Ascend/driver:/usr/local/Ascend/driver
56+
- /usr/local/Ascend/firmware:/usr/local/Ascend/firmware
57+
- /usr/local/sbin/npu-smi:/usr/local/sbin/npu-smi
58+
- /usr/local/sbin:/usr/local/sbin
59+
- /etc/hccn.conf:/etc/hccn.conf
60+
- /home/zkjh/weight:/home/weight
61+
- /home/zkjh:/home/zkjh
62+
jobs:
63+
npu:
64+
resources:
65+
ngpus: 1
66+
shm_size: 500g
67+
env:
68+
MODEL_LIST: DeepSeek-R1-Distill-Qwen-1.5B
69+
IMAGE: 2.1.RC1-800I-A2-py311-openeuler24.03-lts
70+
ENGINE: MindIE

.github/workflows/ci_child.yml

Lines changed: 16 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,9 @@ jobs:
2121
runs-on: self-hosted
2222
timeout-minutes: ${{ matrix.timeout_minutes }}
2323
steps:
24+
- name: Checkout
25+
uses: actions/checkout@v4
26+
2427
- name: Show stage plan
2528
env:
2629
MATRIX_JSON: ${{ inputs.matrix_json }}
@@ -57,38 +60,17 @@ jobs:
5760
f.write("\n```\n\n")
5861
PY
5962
60-
- name: Trigger ${{ matrix.platform }} GPU Smoke Test job
61-
env: ${{ matrix.job_env }}
63+
- name: Build image via .ci/build.py
64+
run: |
65+
python3 .ci/build.py \
66+
--config .ci/config.yaml \
67+
--platform "${{ matrix.platform }}" \
68+
--force
69+
70+
- name: Run stages via .ci/run.py (sequential inside job)
6271
run: |
63-
echo "Smoke Test Starting..."
64-
echo "MODEL_LIST = ${{ matrix.job_env.MODEL_LIST }}"
65-
echo "Version = ${{ matrix.job_env.IMAGE }}"
66-
set -m
67-
cd /home/zkjh
68-
if [ ! -d "${{ matrix.platform }}_test" ]; then
69-
mkdir -p "${{ matrix.platform }}_test"
70-
fi
71-
cd ${{ matrix.platform }}_test
72-
mkdir -p ${{ matrix.platform }}_${{ github.run_id }}_${{ matrix.id }}
73-
cd ${{ matrix.platform }}_${{ github.run_id }}_${{ matrix.id }}
74-
git init
75-
git remote add origin git@github.com:Vincent777/ci_autotest.git
76-
git fetch --depth=1 origin main
77-
git show origin/main:ascend_test_suite/daemon.sh > daemon.sh
78-
chmod a+x daemon.sh
79-
DOCKER_ARGS="${{ join(matrix.docker_args, ' ') }}"
80-
VOLUME_ARGS=$(printf " -v %s" ${{ join(matrix.volumes, ' ') }})
81-
SHM_SIZE="--shm-size=${{ matrix.shm_size }}"
82-
echo "DOCKER_ARGS=$DOCKER_ARGS"
83-
echo "VOLUME_ARGS=$VOLUME_ARGS"
84-
echo "SHM_SIZE=$SHM_SIZE"
85-
PLATFORM=${{ matrix.platform }}
86-
./daemon.sh \
87-
"${PLATFORM^}" \
88-
Smoke \
89-
${{ matrix.job_env.ENGINE }} \
90-
"${{ matrix.job_env.MODEL_LIST }}:${{ matrix.ngpus }}" \
91-
"${DOCKER_ARGS} ${VOLUME_ARGS} ${SHM_SIZE}" \
92-
${{ github.run_id }} \
93-
${{ matrix.job_env.IMAGE }}
94-
72+
python3 .ci/run.py \
73+
--config .ci/config.yaml \
74+
--job "${{ matrix.id }}" \
75+
--local
76+

0 commit comments

Comments
 (0)