|
| 1 | +repo: |
| 2 | + url: https://github.com/InfiniTensor/InfiniOps.git |
| 3 | + branch: master |
| 4 | + |
| 5 | +github: |
| 6 | + status_context_prefix: "ci/infiniops" |
| 7 | + |
| 8 | +# Uncomment and replace the placeholder URLs below with the actual host addresses to
| 9 | +# dispatch jobs to remote machines via `agent.py run`. Required on the trigger machine when
| 10 | +# each platform's agent runs on a separate host. See the README for multi-machine deployment details.
| 11 | +# agents: |
| 12 | +# nvidia: |
| 13 | +# url: http://nvidia-host:8080 |
| 14 | +# iluvatar: |
| 15 | +# url: http://iluvatar-host:8080 |
| 16 | +# metax: |
| 17 | +# url: http://metax-host:8080 |
| 18 | +# moore: |
| 19 | +# url: http://moore-host:8080 |
| 20 | +# cambricon: |
| 21 | +# url: http://cambricon-host:8080 |
| 22 | + |
1 | 23 | platforms: |
2 | 24 | nvidia: |
3 | 25 | image: |
| 26 | + dockerfile: .ci/images/nvidia/ |
4 | 27 | build_args: |
5 | | - BASE_IMAGE: vllm/vllm-openai:v0.10.0 |
| 28 | + BASE_IMAGE: nvcr.io/nvidia/pytorch:24.10-py3 |
| 29 | + setup: pip install .[dev] --no-build-isolation |
| 30 | + jobs: |
| 31 | + gpu: |
| 32 | + resources: |
| 33 | + ngpus: 1 # Scheduler auto-picks this many free GPUs |
| 34 | + memory: 32GB |
| 35 | + shm_size: 16g # Raise Docker's default 64MB shared-memory limit, which is too small for PyTorch DataLoader workers
| 36 | + timeout: 3600 |
| 37 | + # env: # Uncomment to inject extra env vars into the container. |
| 38 | + # MY_VAR: value |
| 39 | + stages: |
| 40 | + - name: test |
| 41 | + run: pytest tests/ -n 8 -v --tb=short --junitxml=/workspace/results/test-results.xml |
| 42 | + |
| 43 | + iluvatar: |
| 44 | + image: |
| 45 | + dockerfile: .ci/images/iluvatar/ |
| 46 | + build_args: |
| 47 | + BASE_IMAGE: corex:qs_pj20250825 |
| 48 | + APT_MIRROR: http://archive.ubuntu.com/ubuntu |
| 49 | + PIP_INDEX_URL: https://pypi.org/simple |
6 | 50 | docker_args: |
7 | | - - "--gpus all" |
8 | | - - "-u root" |
9 | | - - "--network host" |
10 | 51 | - "--privileged" |
11 | 52 | - "--cap-add=ALL" |
12 | 53 | - "--pid=host" |
13 | 54 | - "--ipc=host" |
14 | | - - "--entrypoint=''" |
15 | | - - "--workdir=/workspace" |
16 | | - - "-e CUDA_DEVICE_ORDER=PCI_BUS_ID" |
17 | 55 | volumes: |
18 | 56 | - /dev:/dev |
19 | | - - /home/zkjh/weight:/home/weight |
20 | | - - /home/zkjh/workspace:/workspace |
| 57 | + - /lib/firmware:/lib/firmware |
| 58 | + - /usr/src:/usr/src |
| 59 | + - /lib/modules:/lib/modules |
| 60 | + setup: pip install .[dev] --no-build-isolation |
21 | 61 | jobs: |
22 | 62 | gpu: |
23 | 63 | resources: |
24 | | - ngpus: 2 |
25 | | - shm_size: 80g |
26 | | - env: |
27 | | - MODEL_LIST: Qwen3-32B-FP8 |
28 | | - IMAGE: v0.10.0 |
29 | | - ENGINE: vLLM |
30 | | - ascend: |
| 64 | + gpu_ids: "0" # GPU visibility via CUDA_VISIBLE_DEVICES |
| 65 | + gpu_style: none # CoreX: passthrough via --privileged + /dev mount |
| 66 | + memory: 32GB |
| 67 | + shm_size: 16g |
| 68 | + timeout: 3600 |
| 69 | + stages: |
| 70 | + - name: test |
| 71 | + run: pytest tests/ -n 8 -v --tb=short --junitxml=/workspace/results/test-results.xml |
| 72 | + |
| 73 | + metax: |
31 | 74 | image: |
| 75 | + dockerfile: .ci/images/metax/ |
32 | 76 | build_args: |
33 | | - BASE_IMAGE: swr.cn-south-1.myhuaweicloud.com/ascendhub/mindie:2.1.RC1-800I-A2-py311-openeuler24.03-lts |
| 77 | + BASE_IMAGE: cr.metax-tech.com/public-library/maca-pytorch:3.2.1.4-torch2.4-py310-ubuntu22.04-amd64 |
| 78 | + APT_MIRROR: http://archive.ubuntu.com/ubuntu |
| 79 | + PIP_INDEX_URL: https://pypi.org/simple |
34 | 80 | docker_args: |
35 | 81 | - "--privileged" |
36 | | - - "--cap-add=ALL" |
37 | | - - "--pid=host" |
38 | | - - "--ipc=host" |
39 | | - - "--net=host" |
40 | | - - "--device=/dev/davinci0" |
41 | | - - "--device=/dev/davinci1" |
42 | | - - "--device=/dev/davinci2" |
43 | | - - "--device=/dev/davinci3" |
44 | | - - "--device=/dev/davinci4" |
45 | | - - "--device=/dev/davinci5" |
46 | | - - "--device=/dev/davinci6" |
47 | | - - "--device=/dev/davinci7" |
48 | | - - "--device=/dev/davinci_manager" |
49 | | - - "--device=/dev/hisi_hdc" |
50 | | - - "--device=/dev/devmm_svm" |
51 | | - volumes: |
52 | | - - /usr/local/Ascend/driver/lib64:/usr/local/Ascend/driver/lib64 |
53 | | - - /usr/local/Ascend/driver/include:/usr/local/Ascend/driver/include |
54 | | - - /usr/local/Ascend/driver/tools:/usr/local/Ascend/driver/tools |
55 | | - - /usr/local/Ascend/driver:/usr/local/Ascend/driver |
56 | | - - /usr/local/Ascend/firmware:/usr/local/Ascend/firmware |
57 | | - - /usr/local/sbin/npu-smi:/usr/local/sbin/npu-smi |
58 | | - - /usr/local/sbin:/usr/local/sbin |
59 | | - - /etc/hccn.conf:/etc/hccn.conf |
60 | | - - /home/zkjh/weight:/home/weight |
61 | | - - /home/zkjh:/home/zkjh |
| 82 | + - "--ulimit=memlock=-1" |
| 83 | + - "--ulimit=stack=67108864" |
| 84 | + setup: pip install .[dev] --no-build-isolation |
| 85 | + jobs: |
| 86 | + gpu: |
| 87 | + resources: |
| 88 | + gpu_ids: "0" |
| 89 | + gpu_style: none # MetaX: passthrough via --privileged, no CUDA_VISIBLE_DEVICES |
| 90 | + memory: 32GB |
| 91 | + shm_size: 16g |
| 92 | + timeout: 3600 |
| 93 | + stages: |
| 94 | + - name: test |
| 95 | + run: pytest tests/ -n 4 -v --tb=short --junitxml=/workspace/results/test-results.xml |
| 96 | + |
| 97 | + moore: |
| 98 | + image: |
| 99 | + dockerfile: .ci/images/moore/ |
| 100 | + build_args: |
| 101 | + BASE_IMAGE: sh-harbor.mthreads.com/mcctest/vllm_musa:20251112_hygon |
| 102 | + APT_MIRROR: http://archive.ubuntu.com/ubuntu |
| 103 | + PIP_INDEX_URL: https://pypi.org/simple |
| 104 | + docker_args: |
| 105 | + - "--privileged" |
| 106 | + setup: pip install .[dev] --no-build-isolation |
62 | 107 | jobs: |
63 | | - npu: |
| 108 | + gpu: |
64 | 109 | resources: |
65 | | - ngpus: 1 |
66 | | - shm_size: 500g |
67 | | - env: |
68 | | - MODEL_LIST: DeepSeek-R1-Distill-Qwen-1.5B |
69 | | - IMAGE: 2.1.RC1-800I-A2-py311-openeuler24.03-lts |
70 | | - ENGINE: MindIE |
| 110 | + gpu_ids: "0" |
| 111 | + gpu_style: none # Moore: passthrough via --privileged, MTHREADS_VISIBLE_DEVICES set by base image |
| 112 | + memory: 32GB |
| 113 | + shm_size: 16g |
| 114 | + timeout: 3600 |
| 115 | + stages: |
| 116 | + - name: test |
| 117 | + run: pytest tests/test_add.py tests/test_gemm.py tests/test_swiglu.py -n 4 -v --tb=short --junitxml=/workspace/results/test-results.xml |
| 118 | + |
| 119 | + cambricon: |
| 120 | + image: |
| 121 | + dockerfile: .ci/images/cambricon/ |
| 122 | + build_args: |
| 123 | + BASE_IMAGE: cambricon/pytorch:v1.25.3-torch2.1-anolisos8.8-py310 |
| 124 | + PIP_INDEX_URL: https://pypi.org/simple |
| 125 | + docker_args: |
| 126 | + - "--privileged" |
| 127 | + setup: pip install .[dev] --no-build-isolation |
| 128 | + jobs: |
| 129 | + gpu: |
| 130 | + resources: |
| 131 | + gpu_ids: "0" |
| 132 | + gpu_style: mlu # Cambricon: passthrough via --privileged, MLU_VISIBLE_DEVICES for device control |
| 133 | + memory: 32GB |
| 134 | + shm_size: 16g |
| 135 | + timeout: 3600 |
| 136 | + stages: |
| 137 | + - name: test |
| 138 | + run: pytest tests/test_gemm.py -n 4 -v --tb=short --junitxml=/workspace/results/test-results.xml |
| 139 | + |
| 140 | + ascend: # TODO: Ascend image is not ready yet |
| 141 | + image: |
| 142 | + dockerfile: .ci/images/ascend/ |
| 143 | + build_args: |
| 144 | + BASE_IMAGE: ascendhub.huawei.com/public-ascendhub/ascend-pytorch:24.0.0 |
| 145 | + private_sdk: |
| 146 | + source_env: PRIVATE_SDK_URL |
0 commit comments