From 2d1a3e2e6c06bc25812ea49e193ee56754ec660c Mon Sep 17 00:00:00 2001 From: Kurt Garloff Date: Tue, 16 Jun 2026 12:30:47 +0200 Subject: [PATCH 1/4] Add some workstation and some new GPUs. This way we keep the tables current and helpful. Signed-off-by: Kurt Garloff --- Standards/scs-0100-v3-flavor-naming.md | 3 +- ...w1-flavor-naming-implementation-testing.md | 101 ++++++++++++++++-- 2 files changed, 95 insertions(+), 9 deletions(-) diff --git a/Standards/scs-0100-v3-flavor-naming.md b/Standards/scs-0100-v3-flavor-naming.md index 336bc0d14..eaa329423 100644 --- a/Standards/scs-0100-v3-flavor-naming.md +++ b/Standards/scs-0100-v3-flavor-naming.md @@ -438,7 +438,8 @@ Note that the vendor letter X is mandatory, generation and processing units are | `A` | AMD | compute units (CUs) | | `I` | Intel | execution units (EUs) | -For nVidia, the generation N can be f=Fermi, k=Kepler, m=Maxwell, p=Pascal, v=Volta, t=turing, a=Ampere, l=Ada Lovelace, g=Grace Hopper, b=Blackwell, ..., +For nVidia, the generation N can be f=Fermi, k=Kepler, m=Maxwell, p=Pascal, v=Volta, t=turing, a=Ampere, +l=Ada Lovelace, g=Grace Hopper, b=Blackwell, ..., for AMD GCN-x=0.x, CDNA-x=x, RDNA-x=x.1, RDNA-3.5=3.5, UDNA-x=x for Intel Gen9=0.9, Xe(12.1/DG1)=1, Xe(12.2)=2, Arc(12.7/DG2)=3, BattleImage(20.0)=4, ... (Note: This may need further work to properly reflect what's out there.) diff --git a/Standards/scs-0100-w1-flavor-naming-implementation-testing.md b/Standards/scs-0100-w1-flavor-naming-implementation-testing.md index d043ba28b..2bdc9f6b8 100644 --- a/Standards/scs-0100-w1-flavor-naming-implementation-testing.md +++ b/Standards/scs-0100-w1-flavor-naming-implementation-testing.md @@ -48,7 +48,10 @@ possibly recommended flavors can be created, or the user can set a file containi ### GPU table The most commonly used datacenter GPUs are listed here, showing what GPUs (or partitions -of a GPU) result in what GPU part of the flavor name. +of a GPU) result in what GPU part of the flavor name. 
We provide these for convenience; most +values are taken from data sheets and are not based on our own testing. Providers must look up the values +(SMs/CUs/EUs and VRAM) actually provided to users and fill them correctly into the SCS names. +This is particularly true for the MIG configurations. #### Nvidia (`N`) @@ -95,6 +98,15 @@ No MIG support, 128 Cuda Cores and 4 Tensor Cores per SM. | L40G | 568 | 18176 | 142 | 48G GDDR6 | `GNl-142h-48` | | L40S | 568 | 18176 | 142 | 48G GDDR6 | `GNl-142hh-48` | +| Nvidia GPU | Tensor C | Cuda Cores | SMs | VRAM | SCS name piece | +|--------------|----------|------------|-----|-----------|----------------| +| RTX2000 Ada | 88 | 2816 | 22 | 16G GDDR6 | `GNl-22-16` | +| RTX4000 Ada | 192 | 6144 | 48 | 20G GDDR6 | `GNl-48-20` | +| RTX4500 Ada | 240 | 7680 | 60 | 24G GDDR6 | `GNl-60-24` | +| RTX5000 Ada | 400 | 12800 | 100 | 32G GDDR6 | `GNl-100-32` | +| RTX5880 Ada | 440 | 14080 | 110 | 48G GDDR6 | `GNl-110-48` | +| RTX6000 Ada | 568 | 18176 | 142 | 48G GDDR6 | `GNl-142-48` | + ##### Grace Hopper (`g`) These have MIG support and 128 Cuda Cores and 4 Tensor Cores per SM. @@ -112,6 +124,35 @@ These have MIG support and 128 Cuda Cores and 4 Tensor Cores per SM. [+] The precise numbers for the 1/7 MIG configurations are not known by the author of this document and need validation. +##### Blackwell (`b`) + +These have MIG support and 128 Cuda Cores and 4 Tensor Cores per SM. + +| Nvidia GPU | Fraction | Tensor C | Cuda Cores | SMs | VRAM | SCS GPU name | +|------------|----------|----------|------------|-----|------------|----------------| +| GB200 | 1/1 | 640 | 20480 | 160 | 192G HBM3e | `GNb-160-192h` | +| GB200 | 1/2 | 320 | 10240 | 80 | 96G HBM3e | `GNb-80-96h` | +| GB200 | 2/7 | 88+ | 5632+ | 44+| 45G HBM3e+| `GNb-44-45h`+ | +| GB200 | 1/7 | 44+ | 2816+ | 22+| 23G HBM3e+| `GNb-22-23h`+ | +| GB300 | 1/1 | 640 | 20480 | 160 | 288G HBM3e | `GNb-160-288h` | +| GB300 | 1/2 | 320 | 10240 | 80 | 144G HBM3e | `GNb-80-144h` | +| ... 
| + +[+] The precise numbers for the 1/7 MIG configurations are not known by the author of +this document and need validation. + +| Nvidia GPU | Fraction | Tensor C | Cuda Cores | SMs | VRAM | SCS GPU name | +|-----------------------|----------|----------|------------|-----|------------|----------------| +| RTX Pro2000 Blackwell | 1/1 | 136 | 4352 | 34 | 16G GDDR7 | `GNb-34-16` | +| RTX Pro4000 Blackwell | 1/1 | 280 | 8960 | 70 | 24G GDDR7 | `GNb-70-24` | +| RTX Pro4500 Blackwell | 1/1 | 328 | 10496 | 82 | 32G GDDR7 | `GNb-82-32` | +| RTX Pro5000 Blackwell | 1/1 | 440 | 14080 | 110 | 72G GDDR7 | `GNb-110-72` | +| RTX Pro5000 Blackwell | 1/2 | 220 | 7040 | 55 | 36G GDDR7 | `GNb-55-36` | +| RTX Pro6000 Blackwell | 1/1 | 752 | 26064 | 188 | 96G GDDR7 | `GNb-188-96` | +| RTX Pro6000 Blackwell | 1/2 | 376 | 13032 | 94 | 48G GDDR7 | `GNb-94-48` | +| RTX Pro6000 Blackwell | 1/4 | 188 | 6516 | 47 | 24G GDDR7 | `GNb-47-24` | + + #### AMD Radeon (`A`) ##### CDNA 2 (`2`) @@ -130,28 +171,72 @@ SRIOV partitioning is possible, resulting in pass-through for up to 8 partitions, somewhat similar to Nvidia MIG. 4 Tensor Cores and 64 Stream Processors per CU. -| AMD GPU | Tensor C | Stream Proc | CUs | VRAM | SCS name piece | -|-------------|----------|-------------|-----|------------|----------------| -| Inst MI300X | 1216 | 19456 | 304 | 192G HBM3 | `GA3-304-192h` | -| Inst MI325X | 1216 | 19456 | 304 | 288G HBM3 | `GA3-304-288h` | +| AMD GPU | Tensor C | Stream Proc | CUs | VRAM | SCS name piece | +|-------------|----------|-------------|-----|------------|-----------------| +| Inst MI300X | 1216 | 19456 | 304 | 192G HBM3 | `GA3-304-192h` | +| Inst MI325X | 1216 | 19456 | 304 | 288G HBM3 | `GA3-304-288h` | + +##### CDNA 4 (`4`) + +SRIOV partitioning is possible, resulting in pass-through for +up to 8 partitions, somewhat similar to Nvidia MIG. 4 Tensor +Cores and 64 Stream Processors per CU. 
+ +| AMD GPU | Tensor C | Stream Proc | CUs | VRAM | SCS name piece | +|-------------|----------|-------------|-----|------------|-----------------| +| Inst MI350X | 1024 | 16384 | 256 | 288G HBM3e | `GA4-256-288h` | +| Inst MI355X | 1024 | 16384 | 256 | 288G HBM3e | `GA4-256h-288h` | + +The Instinct MI355X has a higher watttage and thus slightly higher clocks +than the MI350X but is otherwise identical -- we can thus use the `h` modifier +to identify the higher performance version. + +##### Workstation RDNA 3 (`3.1`) and 4 (`4.1`) + +2 Tensor Cores and 64 Stream Processors per CU. + +| AMD Radeon | Tensor C | Stream Proc | CUs | VRAM | SCS name piece | +|--------------|----------|-------------|-----|------------|-----------------| +| Pro W7900 | 192 | 6144 | 96 | 48G GDDR6 | `GA3.1-96-48` | +| AI Pro R9700 | 128 | 4096 | 64 | 32G GDDR6 | `GA4.1-64-32` | Note that we previously assumed more similarity of consumer RDNA-x with -server CDNA-x that actually is the case; the RDNA-x cards now use `x.1` +server CDNA-x than actually is the case; the RDNA-x cards now use `x.1` (since v3.3 as of Oct 2025) to be able to differentiate them. We will tolerate potential rare cases of old installations calling RDNA-x as -generation `x` for the time being. +generation `x` for the time being. If AMD executes on the merging with +UDNA-5, we will avoid this split in the future. #### intel Xe (`I`) ##### Xe-HPC (Ponte Vecchio) (`3`) -1 EU corresponds to one Tensor Core and contains 128 Shading Units. +One EU corresponds to one Tensor Core and contains 128 Shading Units. | intel DC GPU | Tensor C | Shading U | EUs | VRAM | SCS name part | |--------------|----------|-----------|-----|------------|----------------| | Max 1100 | 56 | 7168 | 56 | 48G HBM2e | `GI3-56-48h` | | Max 1550 | 128 | 16384 | 128 | 128G HBM2e | `GI3-128-128h` | +##### Workstation cards Arc B (`4`) + +One EU has one tensor core and 16 shading units. 
+ +| intel GPU | Tensor C | Shading U | EUs | VRAM | SCS name part | +|-------------|----------|-----------|-----|------------|----------------| +| Arc Pro B50 | 128 | 2048 | 128 | 16G GDDR6 | `GI4-128-16` | +| Arc Pro B60 | 160 | 2560 | 160 | 24G GDDR6 | `GI4-160-24` | +| Arc Pro B65 | 160 | 2560 | 160 | 32G GDDR6 | `GI4-160-32` | +| Arc Pro B70 | 256 | 4096 | 256 | 32G GDDR6 | `GI4-256-32` | + +#### Consumer cards + +Note that we don't recommend using consumer cards. +That said, the schema allows to specify them and for example do PCI pass-through +of Nvidia RTX4080S (`GNl-80-16`), RTX4090 (`GNl-128-24`), RTX5080S (`GNb-84-24`), +RTX5090 (`GNb-170-32`), or AMD Radeon RX7900XTX (`GA3.1-96-24`). + + ## Automated tests The following testcases [are implemented](https://github.com/SovereignCloudStack/standards/tree/main/Tests/iaas/openstack_test.py): From 768317913dd8dda2091a7642d806cf98d25bbedf Mon Sep 17 00:00:00 2001 From: Kurt Garloff Date: Tue, 16 Jun 2026 12:46:54 +0200 Subject: [PATCH 2/4] New letter u for Blackwell Ultra. The tensor cores are different enough to justify this. Main use case for GPUs in clouds is AI and the tensor cores do make a difference there. 
Signed-off-by: Kurt Garloff --- Standards/scs-0100-v3-flavor-naming.md | 2 +- ...cs-0100-w1-flavor-naming-implementation-testing.md | 11 ++++++++--- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/Standards/scs-0100-v3-flavor-naming.md b/Standards/scs-0100-v3-flavor-naming.md index eaa329423..ca09253bc 100644 --- a/Standards/scs-0100-v3-flavor-naming.md +++ b/Standards/scs-0100-v3-flavor-naming.md @@ -439,7 +439,7 @@ Note that the vendor letter X is mandatory, generation and processing units are | `I` | Intel | execution units (EUs) | For nVidia, the generation N can be f=Fermi, k=Kepler, m=Maxwell, p=Pascal, v=Volta, t=turing, a=Ampere, -l=Ada Lovelace, g=Grace Hopper, b=Blackwell, ..., +l=Ada Lovelace, g=Grace Hopper, b=Blackwell, u=BlackwellUltra, ..., for AMD GCN-x=0.x, CDNA-x=x, RDNA-x=x.1, RDNA-3.5=3.5, UDNA-x=x for Intel Gen9=0.9, Xe(12.1/DG1)=1, Xe(12.2)=2, Arc(12.7/DG2)=3, BattleImage(20.0)=4, ... (Note: This may need further work to properly reflect what's out there.) diff --git a/Standards/scs-0100-w1-flavor-naming-implementation-testing.md b/Standards/scs-0100-w1-flavor-naming-implementation-testing.md index 2bdc9f6b8..7525dc544 100644 --- a/Standards/scs-0100-w1-flavor-naming-implementation-testing.md +++ b/Standards/scs-0100-w1-flavor-naming-implementation-testing.md @@ -124,7 +124,7 @@ These have MIG support and 128 Cuda Cores and 4 Tensor Cores per SM. [+] The precise numbers for the 1/7 MIG configurations are not known by the author of this document and need validation. -##### Blackwell (`b`) +##### Blackwell (`b`) and Blackwell Ultra (`u`) These have MIG support and 128 Cuda Cores and 4 Tensor Cores per SM. @@ -134,13 +134,18 @@ These have MIG support and 128 Cuda Cores and 4 Tensor Cores per SM. 
| GB200 | 1/2 | 320 | 10240 | 80 | 96G HBM3e | `GNb-80-96h` | | GB200 | 2/7 | 88+ | 5632+ | 44+| 45G HBM3e+| `GNb-44-45h`+ | | GB200 | 1/7 | 44+ | 2816+ | 22+| 23G HBM3e+| `GNb-22-23h`+ | -| GB300 | 1/1 | 640 | 20480 | 160 | 288G HBM3e | `GNb-160-288h` | -| GB300 | 1/2 | 320 | 10240 | 80 | 144G HBM3e | `GNb-80-144h` | +| GB300 | 1/1 | 640 | 20480 | 160 | 288G HBM3e | `GNu-160-288h` | +| GB300 | 1/2 | 320 | 10240 | 80 | 144G HBM3e | `GNu-80-144h` | | ... | [+] The precise numbers for the 1/7 MIG configurations are not known by the author of this document and need validation. +Note that Blackwell Ultra tensor cores have significant enough changes vs. Blackwell that we +gave the BW Ultra GPUs a new letter `u`. In particular, BW Ultra offers over 150% of the FP4 tensor +performance of std. Blackwell and more Special Function Units (which helps attention), but has +regressed INT8 performance. + | Nvidia GPU | Fraction | Tensor C | Cuda Cores | SMs | VRAM | SCS GPU name | |-----------------------|----------|----------|------------|-----|------------|----------------| | RTX Pro2000 Blackwell | 1/1 | 136 | 4352 | 34 | 16G GDDR7 | `GNb-34-16` | From 20a438af00fcdd98d1fa4c6acbf3d5c4d873f8d9 Mon Sep 17 00:00:00 2001 From: Kurt Garloff Date: Tue, 16 Jun 2026 12:56:59 +0200 Subject: [PATCH 3/4] Appease markdownlint. Signed-off-by: Kurt Garloff --- ...w1-flavor-naming-implementation-testing.md | 24 +++++++++---------- 1 file changed, 11 insertions(+), 13 deletions(-) diff --git a/Standards/scs-0100-w1-flavor-naming-implementation-testing.md b/Standards/scs-0100-w1-flavor-naming-implementation-testing.md index 7525dc544..e52fe1cf7 100644 --- a/Standards/scs-0100-w1-flavor-naming-implementation-testing.md +++ b/Standards/scs-0100-w1-flavor-naming-implementation-testing.md @@ -98,14 +98,14 @@ No MIG support, 128 Cuda Cores and 4 Tensor Cores per SM. 
| L40G | 568 | 18176 | 142 | 48G GDDR6 | `GNl-142h-48` | | L40S | 568 | 18176 | 142 | 48G GDDR6 | `GNl-142hh-48` | -| Nvidia GPU | Tensor C | Cuda Cores | SMs | VRAM | SCS name piece | -|--------------|----------|------------|-----|-----------|----------------| -| RTX2000 Ada | 88 | 2816 | 22 | 16G GDDR6 | `GNl-22-16` | -| RTX4000 Ada | 192 | 6144 | 48 | 20G GDDR6 | `GNl-48-20` | -| RTX4500 Ada | 240 | 7680 | 60 | 24G GDDR6 | `GNl-60-24` | -| RTX5000 Ada | 400 | 12800 | 100 | 32G GDDR6 | `GNl-100-32` | -| RTX5880 Ada | 440 | 14080 | 110 | 48G GDDR6 | `GNl-110-48` | -| RTX6000 Ada | 568 | 18176 | 142 | 48G GDDR6 | `GNl-142-48` | +| Nvidia GPU | Tensor C | Cuda Cores | SMs | VRAM | SCS name piece | +|-------------|----------|------------|-----|-----------|----------------| +| RTX2000 Ada | 88 | 2816 | 22 | 16G GDDR6 | `GNl-22-16` | +| RTX4000 Ada | 192 | 6144 | 48 | 20G GDDR6 | `GNl-48-20` | +| RTX4500 Ada | 240 | 7680 | 60 | 24G GDDR6 | `GNl-60-24` | +| RTX5000 Ada | 400 | 12800 | 100 | 32G GDDR6 | `GNl-100-32` | +| RTX5880 Ada | 440 | 14080 | 110 | 48G GDDR6 | `GNl-110-48` | +| RTX6000 Ada | 568 | 18176 | 142 | 48G GDDR6 | `GNl-142-48` | ##### Grace Hopper (`g`) @@ -157,7 +157,6 @@ regressed INT8 performance. | RTX Pro6000 Blackwell | 1/2 | 376 | 13032 | 94 | 48G GDDR7 | `GNb-94-48` | | RTX Pro6000 Blackwell | 1/4 | 188 | 6516 | 47 | 24G GDDR7 | `GNb-47-24` | - #### AMD Radeon (`A`) ##### CDNA 2 (`2`) @@ -193,7 +192,7 @@ Cores and 64 Stream Processors per CU. | Inst MI355X | 1024 | 16384 | 256 | 288G HBM3e | `GA4-256h-288h` | The Instinct MI355X has a higher watttage and thus slightly higher clocks -than the MI350X but is otherwise identical -- we can thus use the `h` modifier +than the MI350X but is otherwise identical - we can thus use the `h` modifier to identify the higher performance version. ##### Workstation RDNA 3 (`3.1`) and 4 (`4.1`) @@ -236,12 +235,11 @@ One EU has one tensor core and 16 shading units. 
#### Consumer cards -Note that we don't recommend using consumer cards. +Note that we don't recommend using consumer cards. That said, the schema allows to specify them and for example do PCI pass-through -of Nvidia RTX4080S (`GNl-80-16`), RTX4090 (`GNl-128-24`), RTX5080S (`GNb-84-24`), +of Nvidia RTX4080S (`GNl-80-16`), RTX4090 (`GNl-128-24`), RTX5080S (`GNb-84-24`), RTX5090 (`GNb-170-32`), or AMD Radeon RX7900XTX (`GA3.1-96-24`). - ## Automated tests The following testcases [are implemented](https://github.com/SovereignCloudStack/standards/tree/main/Tests/iaas/openstack_test.py): From 82065b76a433dfc686759fa7986d9a21e4bb93fa Mon Sep 17 00:00:00 2001 From: Kurt Garloff Date: Tue, 16 Jun 2026 19:55:01 +0200 Subject: [PATCH 4/4] Add Blackwell Ultra 'u' to the flavor naming script. Signed-off-by: Kurt Garloff --- Tests/iaas/scs_0100_flavor_naming/flavor_names.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Tests/iaas/scs_0100_flavor_naming/flavor_names.py b/Tests/iaas/scs_0100_flavor_naming/flavor_names.py index 9bb3e3eea..fbd9519fd 100644 --- a/Tests/iaas/scs_0100_flavor_naming/flavor_names.py +++ b/Tests/iaas/scs_0100_flavor_naming/flavor_names.py @@ -233,17 +233,17 @@ class GPU: brand = TblAttr("Brand", {"N": "Nvidia", "A": "AMD", "I": "Intel"}) gen = DepTblAttr("Gen", brand, { "N": {'': '(unspecified)', "f": "Fermi", "k": "Kepler", "m": "Maxwell", "p": "Pascal", "v": "Volta", - "t": "Turing", "a": "Ampere", "l": "AdaLovelace", "g": "GraceHopper", "b": "Blackwell"}, + "t": "Turing", "a": "Ampere", "l": "AdaLovelace", "g": "GraceHopper", "b": "Blackwell", "u": "Blackwell Ultra"}, "A": {'': '(unspecified)', "0.4": "GCN4.0/Polaris", "0.5": "GCN5.0/Vega", "1": "CDNA1", "1.1": "RDNA1/Navi1x", "2": "CDNA2", "2.1": "RDNA2/Navi2x", "3": "CDNA3", "3.1": "RDNA3/Navi3x", "3.5": "RDNA3.5", "4": "CDNA4", - "4.1": "RDNA-4/Navi4x", "5.1": "RDNA-5/Navi5x"}, + "4.1": "RDNA4/Navi4x", "5.1": "RDNA5/Navi5x"}, "I": {'': '(unspecified)', "0.9": 
"Gen9/Skylake", "0.95": "Gen9.5/KabyLake", "1": "Xe1/Gen12.1/DG1", "2": "Xe2/Gen12.2", "3": "Arc/Gen12.7/DG2", "4": "BattleImage/Gen20.0"}, }) cu = OptIntAttr("#.N:SMs/A:CUs/I:EUs") perf = TblAttr("Frequency", {"": "Std Freq", "h": "High Freq", "hh": "Very High Freq"}) vram = OptIntAttr("#.V:GiB VRAM") - vramperf = TblAttr("Bandwidth", {"": "Std BW {<~1GiB/s)", "h": "High BW", "hh": "Very High BW"}) + vramperf = TblAttr("Bandwidth", {"": "Std BW (GDDR)", "h": "High BW (HBM)", "hh": "Very High BW"}) def __init__(self, gputype="g", brand="N", gen='', cu=None, perf='', vram=None, vramperf=''): self.gputype = gputype