From 3fe2a9839bd7d80fdbdf68b41785fd8f45c64476 Mon Sep 17 00:00:00 2001 From: stackedsax Date: Sun, 24 May 2026 13:39:50 -0700 Subject: [PATCH] =?UTF-8?q?Batch=20Subproject:=20add=20meeting=20notes=20(?= =?UTF-8?q?2022=E2=80=932026),=20charter,=20and=20initiative=20dirs?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-Authored-By: Claude Sonnet 4.6 Signed-off-by: stackedsax --- .../subprojects/batch/charter.md | 34 ++++- .../batch/initiative_benchmarking/README.md | 7 + .../batch/meeting-notes/2022-06-06.md | 58 ++++++++ .../batch/meeting-notes/2022-06-20.md | 14 ++ .../batch/meeting-notes/2022-07-18.md | 58 ++++++++ .../batch/meeting-notes/2022-08-01.md | 22 +++ .../batch/meeting-notes/2022-08-15.md | 13 ++ .../batch/meeting-notes/2022-09-26.md | 17 +++ .../batch/meeting-notes/2022-10-10.md | 16 +++ .../batch/meeting-notes/2022-11-07.md | 17 +++ .../batch/meeting-notes/2022-11-21.md | 5 + .../batch/meeting-notes/2022-12-19.md | 13 ++ .../batch/meeting-notes/2023-01-30.md | 8 ++ .../batch/meeting-notes/2023-02-13.md | 5 + .../batch/meeting-notes/2023-02-27.md | 5 + .../batch/meeting-notes/2023-03-13.md | 9 ++ .../batch/meeting-notes/2023-03-27.md | 11 ++ .../batch/meeting-notes/2023-04-10.md | 11 ++ .../batch/meeting-notes/2023-04-24.md | 8 ++ .../batch/meeting-notes/2023-05-08.md | 14 ++ .../batch/meeting-notes/2023-05-22.md | 17 +++ .../batch/meeting-notes/2023-09-11.md | 14 ++ .../batch/meeting-notes/2023-10-09.md | 11 ++ .../batch/meeting-notes/2024-07-01.md | 24 ++++ .../batch/meeting-notes/2024-11-18.md | 45 ++++++ .../batch/meeting-notes/2024-12-02.md | 40 ++++++ .../batch/meeting-notes/2024-12-16.md | 36 +++++ .../batch/meeting-notes/2025-01-13.md | 29 ++++ .../batch/meeting-notes/2025-01-27.md | 86 +++++++++++ .../batch/meeting-notes/2025-02-11.md | 84 +++++++++++ .../batch/meeting-notes/2025-02-25.md | 51 +++++++ .../batch/meeting-notes/2025-03-11.md | 59 ++++++++ .../batch/meeting-notes/2025-03-25.md | 48 
+++++++ .../batch/meeting-notes/2025-04-08.md | 76 ++++++++++ .../batch/meeting-notes/2025-04-22.md | 59 ++++++++ .../batch/meeting-notes/2025-05-06.md | 67 +++++++++ .../batch/meeting-notes/2025-05-20.md | 72 ++++++++++ .../batch/meeting-notes/2025-06-03.md | 81 +++++++++++ .../batch/meeting-notes/2025-06-17.md | 63 ++++++++ .../batch/meeting-notes/2025-07-01.md | 46 ++++++ .../batch/meeting-notes/2025-07-15.md | 53 +++++++ .../batch/meeting-notes/2025-07-29.md | 136 ++++++++++++++++++ .../batch/meeting-notes/2025-08-12.md | 111 ++++++++++++++ .../batch/meeting-notes/2025-08-26.md | 107 ++++++++++++++ .../batch/meeting-notes/2025-09-09.md | 78 ++++++++++ .../batch/meeting-notes/2025-09-23.md | 82 +++++++++++ .../batch/meeting-notes/2025-10-07.md | 79 ++++++++++ .../batch/meeting-notes/2025-10-21.md | 59 ++++++++ .../batch/meeting-notes/2025-11-18.md | 62 ++++++++ .../batch/meeting-notes/2025-12-02.md | 72 ++++++++++ .../batch/meeting-notes/2026-01-13.md | 20 +++ .../batch/meeting-notes/2026-01-27.md | 32 +++++ .../batch/meeting-notes/2026-02-10.md | 68 +++++++++ .../batch/meeting-notes/2026-02-24.md | 79 ++++++++++ .../batch/meeting-notes/2026-03-10.md | 66 +++++++++ .../batch/meeting-notes/2026-04-07.md | 54 +++++++ .../batch/meeting-notes/2026-04-21.md | 73 ++++++++++ .../batch/meeting-notes/2026-05-05.md | 70 +++++++++ .../batch/meeting-notes/2026-05-19.md | 63 ++++++++ .../batch/meeting-notes/2026-06-02.md | 13 ++ .../subprojects/batch/meeting-notes/README.md | 80 +++++++++++ 61 files changed, 2809 insertions(+), 1 deletion(-) create mode 100644 tags/tag-workloads-foundation/subprojects/batch/initiative_benchmarking/README.md create mode 100644 tags/tag-workloads-foundation/subprojects/batch/meeting-notes/2022-06-06.md create mode 100644 tags/tag-workloads-foundation/subprojects/batch/meeting-notes/2022-06-20.md create mode 100644 tags/tag-workloads-foundation/subprojects/batch/meeting-notes/2022-07-18.md create mode 100644 
tags/tag-workloads-foundation/subprojects/batch/meeting-notes/2022-08-01.md create mode 100644 tags/tag-workloads-foundation/subprojects/batch/meeting-notes/2022-08-15.md create mode 100644 tags/tag-workloads-foundation/subprojects/batch/meeting-notes/2022-09-26.md create mode 100644 tags/tag-workloads-foundation/subprojects/batch/meeting-notes/2022-10-10.md create mode 100644 tags/tag-workloads-foundation/subprojects/batch/meeting-notes/2022-11-07.md create mode 100644 tags/tag-workloads-foundation/subprojects/batch/meeting-notes/2022-11-21.md create mode 100644 tags/tag-workloads-foundation/subprojects/batch/meeting-notes/2022-12-19.md create mode 100644 tags/tag-workloads-foundation/subprojects/batch/meeting-notes/2023-01-30.md create mode 100644 tags/tag-workloads-foundation/subprojects/batch/meeting-notes/2023-02-13.md create mode 100644 tags/tag-workloads-foundation/subprojects/batch/meeting-notes/2023-02-27.md create mode 100644 tags/tag-workloads-foundation/subprojects/batch/meeting-notes/2023-03-13.md create mode 100644 tags/tag-workloads-foundation/subprojects/batch/meeting-notes/2023-03-27.md create mode 100644 tags/tag-workloads-foundation/subprojects/batch/meeting-notes/2023-04-10.md create mode 100644 tags/tag-workloads-foundation/subprojects/batch/meeting-notes/2023-04-24.md create mode 100644 tags/tag-workloads-foundation/subprojects/batch/meeting-notes/2023-05-08.md create mode 100644 tags/tag-workloads-foundation/subprojects/batch/meeting-notes/2023-05-22.md create mode 100644 tags/tag-workloads-foundation/subprojects/batch/meeting-notes/2023-09-11.md create mode 100644 tags/tag-workloads-foundation/subprojects/batch/meeting-notes/2023-10-09.md create mode 100644 tags/tag-workloads-foundation/subprojects/batch/meeting-notes/2024-07-01.md create mode 100644 tags/tag-workloads-foundation/subprojects/batch/meeting-notes/2024-11-18.md create mode 100644 tags/tag-workloads-foundation/subprojects/batch/meeting-notes/2024-12-02.md create mode 100644 
tags/tag-workloads-foundation/subprojects/batch/meeting-notes/2024-12-16.md create mode 100644 tags/tag-workloads-foundation/subprojects/batch/meeting-notes/2025-01-13.md create mode 100644 tags/tag-workloads-foundation/subprojects/batch/meeting-notes/2025-01-27.md create mode 100644 tags/tag-workloads-foundation/subprojects/batch/meeting-notes/2025-02-11.md create mode 100644 tags/tag-workloads-foundation/subprojects/batch/meeting-notes/2025-02-25.md create mode 100644 tags/tag-workloads-foundation/subprojects/batch/meeting-notes/2025-03-11.md create mode 100644 tags/tag-workloads-foundation/subprojects/batch/meeting-notes/2025-03-25.md create mode 100644 tags/tag-workloads-foundation/subprojects/batch/meeting-notes/2025-04-08.md create mode 100644 tags/tag-workloads-foundation/subprojects/batch/meeting-notes/2025-04-22.md create mode 100644 tags/tag-workloads-foundation/subprojects/batch/meeting-notes/2025-05-06.md create mode 100644 tags/tag-workloads-foundation/subprojects/batch/meeting-notes/2025-05-20.md create mode 100644 tags/tag-workloads-foundation/subprojects/batch/meeting-notes/2025-06-03.md create mode 100644 tags/tag-workloads-foundation/subprojects/batch/meeting-notes/2025-06-17.md create mode 100644 tags/tag-workloads-foundation/subprojects/batch/meeting-notes/2025-07-01.md create mode 100644 tags/tag-workloads-foundation/subprojects/batch/meeting-notes/2025-07-15.md create mode 100644 tags/tag-workloads-foundation/subprojects/batch/meeting-notes/2025-07-29.md create mode 100644 tags/tag-workloads-foundation/subprojects/batch/meeting-notes/2025-08-12.md create mode 100644 tags/tag-workloads-foundation/subprojects/batch/meeting-notes/2025-08-26.md create mode 100644 tags/tag-workloads-foundation/subprojects/batch/meeting-notes/2025-09-09.md create mode 100644 tags/tag-workloads-foundation/subprojects/batch/meeting-notes/2025-09-23.md create mode 100644 tags/tag-workloads-foundation/subprojects/batch/meeting-notes/2025-10-07.md create mode 100644 
tags/tag-workloads-foundation/subprojects/batch/meeting-notes/2025-10-21.md create mode 100644 tags/tag-workloads-foundation/subprojects/batch/meeting-notes/2025-11-18.md create mode 100644 tags/tag-workloads-foundation/subprojects/batch/meeting-notes/2025-12-02.md create mode 100644 tags/tag-workloads-foundation/subprojects/batch/meeting-notes/2026-01-13.md create mode 100644 tags/tag-workloads-foundation/subprojects/batch/meeting-notes/2026-01-27.md create mode 100644 tags/tag-workloads-foundation/subprojects/batch/meeting-notes/2026-02-10.md create mode 100644 tags/tag-workloads-foundation/subprojects/batch/meeting-notes/2026-02-24.md create mode 100644 tags/tag-workloads-foundation/subprojects/batch/meeting-notes/2026-03-10.md create mode 100644 tags/tag-workloads-foundation/subprojects/batch/meeting-notes/2026-04-07.md create mode 100644 tags/tag-workloads-foundation/subprojects/batch/meeting-notes/2026-04-21.md create mode 100644 tags/tag-workloads-foundation/subprojects/batch/meeting-notes/2026-05-05.md create mode 100644 tags/tag-workloads-foundation/subprojects/batch/meeting-notes/2026-05-19.md create mode 100644 tags/tag-workloads-foundation/subprojects/batch/meeting-notes/2026-06-02.md create mode 100644 tags/tag-workloads-foundation/subprojects/batch/meeting-notes/README.md diff --git a/tags/tag-workloads-foundation/subprojects/batch/charter.md b/tags/tag-workloads-foundation/subprojects/batch/charter.md index b1d2c585e..f011c90f5 100644 --- a/tags/tag-workloads-foundation/subprojects/batch/charter.md +++ b/tags/tag-workloads-foundation/subprojects/batch/charter.md @@ -1 +1,33 @@ -Charter content here \ No newline at end of file +# TAG Workloads Foundation Batch Subproject Charter + +## Mission + +The cloud-native batch scheduling ecosystem is fragmented — different projects tackle job scheduling, queueing, and resource management in incompatible ways. 
The Batch subproject brings together maintainers and users across the ecosystem to reduce that fragmentation: aligning on common Kubernetes APIs and primitives, developing best practices, and improving outcomes for batch workloads — whether HPC, AI/ML, data analytics, or CI — in cloud-native environments. + +## Scope + +### In Scope + +To reduce fragmentation in the Kubernetes batch ecosystem: congregate leads and users from different external and internal projects and user groups (CNCF TAGs, Kubernetes sub-projects focused on batch-related features such as topology-aware scheduling) in the batch ecosystem to gather requirements, validate designs and encourage reuse of core Kubernetes APIs. + +Recommendations for enhancements in the following areas: + +* Additions to the batch API group, currently including Job and CronJob resources that benefit batch use cases such as HPC, AI/ML, data analytics and CI. +* Primitives for job-level queueing, not limited to the Kubernetes Job resource. Long-term, this could include multi-cluster support. +* Primitives to control and maximize utilization of resources in fixed-size clusters (on-prem) and elastic clusters (cloud). +* Benchmarking models for Batch systems +* Data Locality +* User Stories +* Scheduling support for specialized hardware (Accelerators, NUMA, Networking, etc.) + +### Out of Scope + +* Addition of new API kinds that serve a specialized type of workload. The focus should be on general APIs that specialized controllers can build on top of. +* Uses of the batch APIs as support for serving workloads (e.g., backups, upgrades, migrations). These can be served by existing SIGs. +* Proposals that duplicate the functionality of core Kubernetes components (job-controller, kube-scheduler, cluster-autoscaler). +* Job workflows or pipelines. Mature third-party frameworks serve these use cases with the current Kubernetes primitives. But additional primitives to support these frameworks could be in scope. 
+ +## Deliverables + +* **Project Landscape** — a living catalogue of batch scheduling projects in the cloud-native ecosystem, maintained at [bsi-landscape.netlify.app](https://bsi-landscape.netlify.app/). +* **Whitepapers and Technical Research** — the subproject produces papers and research on topics relevant to cloud-native batch scheduling, such as benchmarking of batch systems, data locality, scheduling best practices, and user stories. An initial series of five whitepapers is complete, with more planned as the space evolves. diff --git a/tags/tag-workloads-foundation/subprojects/batch/initiative_benchmarking/README.md b/tags/tag-workloads-foundation/subprojects/batch/initiative_benchmarking/README.md new file mode 100644 index 000000000..40ec197c3 --- /dev/null +++ b/tags/tag-workloads-foundation/subprojects/batch/initiative_benchmarking/README.md @@ -0,0 +1,7 @@ +# Batch Subproject — Benchmarking Initiative + +This directory contains work from the benchmarking initiative of the [CNCF Batch Subproject](https://tag-workloads-foundation.cncf.io/batch/). + +## Overview + +The benchmarking initiative develops models, methodologies, and tools for evaluating and comparing batch scheduling systems in cloud-native environments. 
diff --git a/tags/tag-workloads-foundation/subprojects/batch/meeting-notes/2022-06-06.md b/tags/tag-workloads-foundation/subprojects/batch/meeting-notes/2022-06-06.md new file mode 100644 index 000000000..2720e4880 --- /dev/null +++ b/tags/tag-workloads-foundation/subprojects/batch/meeting-notes/2022-06-06.md @@ -0,0 +1,58 @@ +# 📅 Jun 6, 2022 + +## 👥 Attendees + +- Alex Scammon (G-Research) [host] +- Kevin Hannon (G Research Open Source) +- Dave Gantenbein (G Research) +- Jonathan Skone (LBNL NERSC) +- Diana Arroyo (IBM) +- Nathan Rini (SchedMD) + +## 📋 Agenda + +- Introductions and agenda setting + +## Discussion Notes + +Introductions and Interests: +Participants introduced themselves and shared their backgrounds and interests in batch processing within a cloud-native framework. +Alex Scammon (G-Research): Leads open-source initiatives; keen on multi-cluster batch scheduling and bridging gaps between infrastructure teams and researchers. +Jonathan Skone (Lawrence Berkeley National Lab): Focuses on emerging technologies to prepare for next-generation system procurement, balancing traditional HPC workflows with cloud-native approaches. +Diana Arroyo (IBM Research): Works on Multi-Cluster App Dispatcher (MCAD) and batch scheduling challenges in multi-cluster and hybrid environments. +Nathan Rini (SchedMD): Maintains SLURM batch scheduler; aims to enhance SLURM’s container support and explore integration with Kubernetes. +Kevin Hannon and Dave Gantenbein (G-Research): Contributors to Armada and related Open Source projects focused on multi-cluster scheduling. +Directive for the Working Group: +Discussion centered on defining the unique role of this WG compared to other groups, such as the Kubernetes Batch WG and the CNCF Research End User Group. +Agreement to focus on: +Identifying and solving key issues in batch processing for Kubernetes and derivatives. +Bridging the divide between infrastructure teams and end users (researchers). 
+White Paper Development: +Plan to write a white paper addressing: +Key problems with batch processing in Kubernetes and derivatives. +Assessment of tools and solutions available in the ecosystem. +The cultural and operational divide between end users and infrastructure teams. +The white paper will also serve as a baseline for identifying future work and proof-of-concept projects for the WG. +Survey Development: +Proposal to create and distribute a survey to: +Gather insights from end users and infrastructure teams on their batch processing needs and challenges. +Identify common barriers and misalignments between these groups. +Alex Scammon and Diana Arroyo volunteered to collaborate on designing the survey. +Participation Expansion: +Emphasis on inviting broader participation in the WG by reaching out to relevant stakeholders. Suggestions included: +Klaus Ma (Volcano) +Participants from San Diego Supercomputing Center and Google. +Representatives from vendors like IBM/Red Hat and Cray/HPE. +Meeting Time Adjustment: +Proposal to move meeting time from 7:30 AM PDT to 8:00 AM PDT to better accommodate participants. +Alex will confirm with Volcano participants (Asia-based) regarding their intentions to join and the feasibility of the time adjustment. +Other Key Discussion Points: +Cultural Divide: +Agreement that researchers often resist change and prefer established tools (e.g., SLURM), whereas infrastructure teams focus on long-term maintainability and scalability (e.g., Kubernetes). +Challenge of creating solutions that balance these competing priorities while improving usability and flexibility. +Infrastructure Observations: +Jonathan described NERSC’s move to Kubernetes for managing control planes and enterprise services, but noted that only ~25% of workloads are containerized. +Nathan highlighted that SLURM remains widely used, with some researchers embedding it into Kubernetes clusters to maintain traditional workflows. 
+Potential Collaboration Areas: +Standardizing workflows across schedulers and reducing fragmentation in Kubernetes batch solutions (e.g., Armada, Volcano, MCAD). +Exploring proof-of-concept projects after the white paper to develop unified approaches or tooling for batch processing in cloud-native environments. diff --git a/tags/tag-workloads-foundation/subprojects/batch/meeting-notes/2022-06-20.md b/tags/tag-workloads-foundation/subprojects/batch/meeting-notes/2022-06-20.md new file mode 100644 index 000000000..6b61425ba --- /dev/null +++ b/tags/tag-workloads-foundation/subprojects/batch/meeting-notes/2022-06-20.md @@ -0,0 +1,14 @@ +# 📅 Jun 20, 2022 + +## 👥 Attendees + +- Alex Scammon (G Research) +- Diana Arroyo (IBM) +- Nathan Rini (SchedMD) +- C. Rindi (G Research) +- Klaus Ma (Nvidia) +- Jonathan Skone (LBNL NERSC) + +## Discussion Notes + +Batch Survey diff --git a/tags/tag-workloads-foundation/subprojects/batch/meeting-notes/2022-07-18.md b/tags/tag-workloads-foundation/subprojects/batch/meeting-notes/2022-07-18.md new file mode 100644 index 000000000..c108876e7 --- /dev/null +++ b/tags/tag-workloads-foundation/subprojects/batch/meeting-notes/2022-07-18.md @@ -0,0 +1,58 @@ +# 📅 Jul 18, 2022 + +## 👥 Attendees + +- Alex Scammon (G-Research) +- Kevin Hannon (G-Research) +- Diana Arroyo (IBM) +- Jonathan Skone (NERSC LBNL) +- Weiwei Yang (Apple) + +## 📋 Agenda + +- Updates: +- Cleanup of our docs +- Applied to have a Cloud Native Community Group presence +- Go over the latest Batch Survey ideas +- Discuss the “working” of this working group +- Kevin: understands k8s batch working group; unsure of research user group +- Jamie: +- Research user group is CNCF tech for research institutions +- Talks about batch a lot, but not exclusively. Notebooks; security implications +- Mostly focused around k8s but it’s focused on the type of person invited. 
Generally from Academia or other “research” institutions +- Jonathan: +- Thought k8s batch was going to be focused on working on fundamental improvements to k8s for batch +- K8s batch has started to include more education/information sharing +- Someone spoke on Slurm w/ k8s glasses on at the last k8s batch +- Diana: +- K8s batch started on focusing on Kueue +- As discussions evolved, wanted to identify patterns +- K8s batch currently surveying the landscape to move forward on the lower level improvements +- Kevin: +- Agreed with Diana: understanding the use-cases and common problems so that the underlying architecture improvements will be correct +- Diana: +- Alex: +- Back to the original question: what “work” are we up for in this working group? +- Kevin: +- Feels that k8s isn’t in the toolbox for researchers and HPC community +- Maybe focus on “old-world” schedulers +- Reach out to the non-k8s community +- From work at NIH, maybe reaching out to bioinformatics community +- Like Galaxy community +- KubeGene on Volcano, perhaps? +- Diana: +- Good idea to take a specific feature of batch schedulers and see how that feature differs +- Jonathan: +- Likes the idea for this group to phase itself out over time +- Not worry too much about creating tasks up front; let the conversation evolve +- Be a beacon of recommendations for batch in CNCF +- Wei: +- Finding the discussion helpful; looking for clarity around all these working groups +- More Outreach ideas +- Alex: +- Kevin, give me names from bioinformatics, etc. +- Jonathan: +- Has access to researchers; has a broad task to understand their needs +- Target the High-Energy physicists community? Perhaps Atlas group? +- Target infra side of the house in research +- Alex: Maybe ask Ricardo to send some folks? 
diff --git a/tags/tag-workloads-foundation/subprojects/batch/meeting-notes/2022-08-01.md b/tags/tag-workloads-foundation/subprojects/batch/meeting-notes/2022-08-01.md new file mode 100644 index 000000000..4decc748a --- /dev/null +++ b/tags/tag-workloads-foundation/subprojects/batch/meeting-notes/2022-08-01.md @@ -0,0 +1,22 @@ +# 📅 Aug 1, 2022 + +## 👥 Attendees + +- Alex Scammon (G-Research) +- Jorge Vargas (Thermo Fisher Scientific) +- Diana Arroyo (IBM) +- Kevin Hannon (G-Research) +- Abhishek Malvankar (IBM) +- Michel Sumbul (G-Research) +- Dave Gantenbein (G-Research) +- Jonathan Skone (NERSC) + +## 📋 Agenda + +- News and updates +- Armada Sandboxed +- Survey Update +- Targeting Infra Community +- Academic services +- Sidecar services: +- Infra groups that set up long-lived enterprisey services diff --git a/tags/tag-workloads-foundation/subprojects/batch/meeting-notes/2022-08-15.md b/tags/tag-workloads-foundation/subprojects/batch/meeting-notes/2022-08-15.md new file mode 100644 index 000000000..6f3830d9d --- /dev/null +++ b/tags/tag-workloads-foundation/subprojects/batch/meeting-notes/2022-08-15.md @@ -0,0 +1,13 @@ +# 📅 Aug 15, 2022 + +## 👥 Attendees + +- Abhishek Malvankar (IBM) +- Alex Scammon (GR Open Source) +- Dave Gantenbein (GR Open Source) +- Diana Arroyo (IBM) + +## Discussion Notes + +Survey update, CNCF access +more diff --git a/tags/tag-workloads-foundation/subprojects/batch/meeting-notes/2022-09-26.md b/tags/tag-workloads-foundation/subprojects/batch/meeting-notes/2022-09-26.md new file mode 100644 index 000000000..c6fc05acd --- /dev/null +++ b/tags/tag-workloads-foundation/subprojects/batch/meeting-notes/2022-09-26.md @@ -0,0 +1,17 @@ +# 📅 Sep 26, 2022 + +## 👥 Attendees + +- Alex Scammon (G-Research) +- Kevin Hannon (G-Research) +- Abhishek Malvankar (IBM) +- Dave Gantenbein (G-Research) +- Jonathan Skone (NERSC) + +## 📋 Agenda + +- Ideas from Kubecon for potential presenters? +- Kevin: How do HPC schedulers support cloud? 
+- https://community.cncf.io/events/details/cncf-research-end-user-group-presents-cncf-research-end-user-group-oci-containers-with-scrun-nate-rini-schedmd-2022-10-05/ +- Nate: +- https://www.youtube.com/watch?v=7y7IpCTj5mk&t=1s diff --git a/tags/tag-workloads-foundation/subprojects/batch/meeting-notes/2022-10-10.md b/tags/tag-workloads-foundation/subprojects/batch/meeting-notes/2022-10-10.md new file mode 100644 index 000000000..fb009bc31 --- /dev/null +++ b/tags/tag-workloads-foundation/subprojects/batch/meeting-notes/2022-10-10.md @@ -0,0 +1,16 @@ +# 📅 Oct 10, 2022 + +## 👥 Attendees + +- Kevin Hannon (G-Research) +- Nathan Rini (SchedMD) +- Alex Scammon (G-Research) +- Dave Gantenbein (G-Research) +- Abhishek Malvankar (IBM) +- Jonathan Skone (NERSC) + +## 📋 Agenda + +- Survey Updates +- Need more options and help +- Nathan Rini: Slurm Container Presentation diff --git a/tags/tag-workloads-foundation/subprojects/batch/meeting-notes/2022-11-07.md b/tags/tag-workloads-foundation/subprojects/batch/meeting-notes/2022-11-07.md new file mode 100644 index 000000000..dcf2d9838 --- /dev/null +++ b/tags/tag-workloads-foundation/subprojects/batch/meeting-notes/2022-11-07.md @@ -0,0 +1,17 @@ +# 📅 Nov 7, 2022 + +## 👥 Attendees + +- (G-Research Open Source) +- (G-Research Open Source) +- Jonathan Skone (NERSC) +- Alex Scammon (G-Research Open Source) +- Nate Rini (SchedMD) + +## Discussion Notes + +Kevin presents “Interactive Jobs in Kubernetes” +https://docs.google.com/document/d/1-kiduaazR9-04_pcoUJE4zECNVYnar0RtfJezUBBwzM/edit# +Batch WG Mandate +Survey Updates +Other Kubecon Takeaways diff --git a/tags/tag-workloads-foundation/subprojects/batch/meeting-notes/2022-11-21.md b/tags/tag-workloads-foundation/subprojects/batch/meeting-notes/2022-11-21.md new file mode 100644 index 000000000..1e78c5d47 --- /dev/null +++ b/tags/tag-workloads-foundation/subprojects/batch/meeting-notes/2022-11-21.md @@ -0,0 +1,5 @@ +# 📅 Nov 21, 2022 + +## Discussion Notes + +(Notes forthcoming) diff 
--git a/tags/tag-workloads-foundation/subprojects/batch/meeting-notes/2022-12-19.md b/tags/tag-workloads-foundation/subprojects/batch/meeting-notes/2022-12-19.md new file mode 100644 index 000000000..e2a2127b7 --- /dev/null +++ b/tags/tag-workloads-foundation/subprojects/batch/meeting-notes/2022-12-19.md @@ -0,0 +1,13 @@ +# 📅 Dec 19, 2022 + +## 👥 Attendees + +- Kevin Hannon (G-Research Open Source) +- Alex Scammon (G-Research Open Source) +- Jonathan Skone (NERSC) +- Nate Rini (SchedMD) + +## Discussion Notes + +(Cats) +Hardware, roadmaps, Kubernetes diff --git a/tags/tag-workloads-foundation/subprojects/batch/meeting-notes/2023-01-30.md b/tags/tag-workloads-foundation/subprojects/batch/meeting-notes/2023-01-30.md new file mode 100644 index 000000000..ac612c89c --- /dev/null +++ b/tags/tag-workloads-foundation/subprojects/batch/meeting-notes/2023-01-30.md @@ -0,0 +1,8 @@ +# 📅 Jan 30, 2023 + +## 👥 Attendees + +- Nate Rini (SchedMD) +- Kevin Hannon (G-Research Open Source) +- Alex Scammon (G-Research Open Source) +- Diana Arroyo (IBM) diff --git a/tags/tag-workloads-foundation/subprojects/batch/meeting-notes/2023-02-13.md b/tags/tag-workloads-foundation/subprojects/batch/meeting-notes/2023-02-13.md new file mode 100644 index 000000000..5753f80cd --- /dev/null +++ b/tags/tag-workloads-foundation/subprojects/batch/meeting-notes/2023-02-13.md @@ -0,0 +1,5 @@ +# 📅 Feb 13, 2023 + +## Discussion Notes + +(notes forthcoming) diff --git a/tags/tag-workloads-foundation/subprojects/batch/meeting-notes/2023-02-27.md b/tags/tag-workloads-foundation/subprojects/batch/meeting-notes/2023-02-27.md new file mode 100644 index 000000000..1144c338b --- /dev/null +++ b/tags/tag-workloads-foundation/subprojects/batch/meeting-notes/2023-02-27.md @@ -0,0 +1,5 @@ +# 📅 Feb 27, 2023 + +## Discussion Notes + +(notes forthcoming) diff --git a/tags/tag-workloads-foundation/subprojects/batch/meeting-notes/2023-03-13.md b/tags/tag-workloads-foundation/subprojects/batch/meeting-notes/2023-03-13.md 
new file mode 100644 index 000000000..258dbf1df --- /dev/null +++ b/tags/tag-workloads-foundation/subprojects/batch/meeting-notes/2023-03-13.md @@ -0,0 +1,9 @@ +# 📅 Mar 13, 2023 + +## 👥 Attendees + +- Alex Scammon (GR Open Source) +- Tim Middelkoop (Internet2) +- Nate Rini (SchedMD) +- Jonathan Skone (NERSC) +- Dave Gantenbein (GR Open Source) diff --git a/tags/tag-workloads-foundation/subprojects/batch/meeting-notes/2023-03-27.md b/tags/tag-workloads-foundation/subprojects/batch/meeting-notes/2023-03-27.md new file mode 100644 index 000000000..0a841408b --- /dev/null +++ b/tags/tag-workloads-foundation/subprojects/batch/meeting-notes/2023-03-27.md @@ -0,0 +1,11 @@ +# 📅 Mar 27, 2023 + +## 👥 Attendees + +- Alex Scammon +- Caterina Rindi +- Diana Arroyo +- Jonathan Skone +- Kevin Hannon +- Nate Rini +- Matthew West diff --git a/tags/tag-workloads-foundation/subprojects/batch/meeting-notes/2023-04-10.md b/tags/tag-workloads-foundation/subprojects/batch/meeting-notes/2023-04-10.md new file mode 100644 index 000000000..bfd255ffe --- /dev/null +++ b/tags/tag-workloads-foundation/subprojects/batch/meeting-notes/2023-04-10.md @@ -0,0 +1,11 @@ +# 📅 Apr 10, 2023 + +## 👥 Attendees + +- Kevin Hannon (G-Research Open Source) +- Caterina Rindi (G-Research Open Source) +- Alex Scammon (G-Research Open Source) +- Nate Rini (SchedMD) +- Jonathan Skone (NERSC) +- Diana Arroyo (IBM) +- Dave Gantenbein (G-Research Open Source) diff --git a/tags/tag-workloads-foundation/subprojects/batch/meeting-notes/2023-04-24.md b/tags/tag-workloads-foundation/subprojects/batch/meeting-notes/2023-04-24.md new file mode 100644 index 000000000..1a4d33abb --- /dev/null +++ b/tags/tag-workloads-foundation/subprojects/batch/meeting-notes/2023-04-24.md @@ -0,0 +1,8 @@ +# 📅 Apr 24, 2023 + +## 👥 Attendees + +- Jonathan Skone (NERSC) +- Nate Rini (SchedMD) +- Diana Arroyo (IBM) +- Caterina Rindi (G-Research Open Source) diff --git a/tags/tag-workloads-foundation/subprojects/batch/meeting-notes/2023-05-08.md 
b/tags/tag-workloads-foundation/subprojects/batch/meeting-notes/2023-05-08.md new file mode 100644 index 000000000..bbac60b2c --- /dev/null +++ b/tags/tag-workloads-foundation/subprojects/batch/meeting-notes/2023-05-08.md @@ -0,0 +1,14 @@ +# 📅 May 8, 2023 + +## 👥 Attendees + +- Kevin Hannon (G-Research Open Source) +- Jonathan Skone (NERSC) +- Diana Arroyo (IBM) +- Alex Scammon (G-Research Open Source) +- Dave Gantenbein (G-Research Open Source) + +## 📋 Agenda + +- KubeCon-EU debrief +- NERSC RFP (https://www.nersc.gov/systems/nersc-10/draft-tech-req/) diff --git a/tags/tag-workloads-foundation/subprojects/batch/meeting-notes/2023-05-22.md b/tags/tag-workloads-foundation/subprojects/batch/meeting-notes/2023-05-22.md new file mode 100644 index 000000000..48e931efd --- /dev/null +++ b/tags/tag-workloads-foundation/subprojects/batch/meeting-notes/2023-05-22.md @@ -0,0 +1,17 @@ +# 📅 May 22, 2023 + +## 👥 Attendees + +- Alex Scammon (G-Research Open Source) +- Caterina Rindi (G-Research Open Source) +- Nate Rini (SchedMD) +- Jonathan Skone (NERSC) +- Matthew West + +## 📋 Agenda + +- Batch Landscape Discussions +- Three Possible Landscape Venues: +- https://github.com/lfai/lfai-landscape +- https://github.com/cncf/landscape +- https://github.com/cncf/tag-runtime/tree/main diff --git a/tags/tag-workloads-foundation/subprojects/batch/meeting-notes/2023-09-11.md b/tags/tag-workloads-foundation/subprojects/batch/meeting-notes/2023-09-11.md new file mode 100644 index 000000000..5418771cb --- /dev/null +++ b/tags/tag-workloads-foundation/subprojects/batch/meeting-notes/2023-09-11.md @@ -0,0 +1,14 @@ +# 📅 Sep 11, 2023 + +## 👥 Attendees + +- Alex Scammon (G-Research) + +## Discussion Notes + +Housecleaning +Fixed calendar invites on: +https://docs.google.com/document/d/1mIhSTOa5bQWHY9oAIopzpy1w-hACm_ewe7fzMZ30Z6c/edit +https://docs.google.com/document/d/1GuZGyBkRGG0lEeiPA8q0PfvFlwUlwa5k-ZfXafCTdBY/edit#heading=h.63y814c3aujl 
+https://github.com/cncf/tag-runtime/blob/main/wg/bsi.md +https://github.com/cncf/tag-runtime/pull/76 diff --git a/tags/tag-workloads-foundation/subprojects/batch/meeting-notes/2023-10-09.md b/tags/tag-workloads-foundation/subprojects/batch/meeting-notes/2023-10-09.md new file mode 100644 index 000000000..fb3ca7364 --- /dev/null +++ b/tags/tag-workloads-foundation/subprojects/batch/meeting-notes/2023-10-09.md @@ -0,0 +1,11 @@ +# 📅 Oct 9, 2023 + +## 👥 Attendees + +- Alex Scammon (G-Research) +- Mehdi Nassim Khodja +- Klaus Ma (Nvidia) + +## 📋 Agenda + +- Demo Batch Scheduling landscape for CNCF (Mehdi) diff --git a/tags/tag-workloads-foundation/subprojects/batch/meeting-notes/2024-07-01.md b/tags/tag-workloads-foundation/subprojects/batch/meeting-notes/2024-07-01.md new file mode 100644 index 000000000..ba910bc9b --- /dev/null +++ b/tags/tag-workloads-foundation/subprojects/batch/meeting-notes/2024-07-01.md @@ -0,0 +1,24 @@ +# 📅 Jul 1, 2024 + +## 👥 Attendees + +- Alex Scammon (G-Research) +- Eduardo Arango (Nvidia) +- Abhishek (IBM) +- Adam McArthur (G-Research) +- Jonathan Skone (NERSC) +- Marlow Warnicke (SchedMD) +- Nathan Rini (SchedMD) +- Priyanka Sharma (CNCF) +- Skyler Malinowski (SchedMD) +- Tim Wickberg (SchedMD) +- Timothy Middelkoop (Internet2) +- Elias Quintero + +## 📋 Agenda + +- Slurm + Kubernetes Discussion (Eduardo leading) +- Logistics: +- Welcome Abhishek +- ARCOS Symposium Talk +- New APAC Discussion Time Added diff --git a/tags/tag-workloads-foundation/subprojects/batch/meeting-notes/2024-11-18.md b/tags/tag-workloads-foundation/subprojects/batch/meeting-notes/2024-11-18.md new file mode 100644 index 000000000..647f01f4a --- /dev/null +++ b/tags/tag-workloads-foundation/subprojects/batch/meeting-notes/2024-11-18.md @@ -0,0 +1,45 @@ +# 📅 Nov 18, 2024 + +## 👥 Attendees + +- Alex Scammon (Head of Open Source Software, G-Research) [host] +- Abhishek Malvankar (IBM) [host] +- Ashton Graves (University of Nebraska, Lincoln, HPC center) +- Pradeep 
Madhavarapu (Distinguished Engineer, Nvidia, Cloud Storage and Data Management) +- Larry Rudolph (Two Sigma, Research group & MIT professor) +- Marlow Warnicke (SchedMD, Slurm & Kubernetes projects) +- Marty Ford (Insight Softmax, HPC infrastructure and platform engineering) +- Jonathan Skone (NERSC) + +## 📋 Agenda + +- “Overcoming Data Gravity” or “Schedulers + Storage” +- Or: how to place data with compute or move compute to the data, especially in multi-region environments? +- Slinky - SchedMD’s project for Slurm on Kubernetes +- Speaker: Marlow + +## Discussion Notes + +Introductions and Context: +Participants introduced themselves, highlighting their roles and interest in HPC, scheduling, and storage. +Discussed the need to improve collaboration between schedulers and storage solutions to solve data-compute proximity issues. +Challenges Identified: +Manual Data Placement: Many organizations rely on manual tagging for scheduling jobs near data, a suboptimal and inefficient process. +Scale and Cost: Solutions vary widely depending on data size and organizational constraints. +Data Lineage and Provenance: A standardized way to describe data dependencies and lineage across systems is lacking. +Caching vs. Prefetching: Clarified that caching only works when data is reused, whereas prefetching is necessary for one-off jobs. +Communication Gap: There is little interaction between storage layers and schedulers, leading to inefficiencies. +Key Discussion Points: +Storage-Scheduler Interaction: Proposed that schedulers publish plans (e.g., data dependencies) to storage systems to enable prefetching and optimization. +D3N/D4N Models: Larry Rudolph discussed their potential to bridge data lineage and scheduling gaps. A deeper dive was planned for the next meeting. +Benchmarks: Highlighted the lack of suitable benchmarks for testing scheduler efficiency in complex, multi-region, and cloud-native workloads. 
+Cost Modeling: Emphasized the need to integrate cost factors (e.g., network charges, replication expenses) into scheduling and storage decisions. +Proposals and Next Steps: +Larry Rudolph to present on D3N/D4N approaches at the next meeting. +Expand outreach to storage vendors (e.g., VAST, WEKA, Hammerspace, Alluxio) for their input on this challenge. +Begin exploring benchmarks and workload definitions for complex, multi-region batch scheduling. +Miscellaneous: +Published a landscape document categorizing batch schedulers, aiming to prevent redundancy and foster collaboration. +Discussed potential future alignment on metadata standards and open-source contributions to solve identified issues. +Storage options in cloud native context +Accelerator sharing modes used in the community diff --git a/tags/tag-workloads-foundation/subprojects/batch/meeting-notes/2024-12-02.md b/tags/tag-workloads-foundation/subprojects/batch/meeting-notes/2024-12-02.md new file mode 100644 index 000000000..fe5d672d4 --- /dev/null +++ b/tags/tag-workloads-foundation/subprojects/batch/meeting-notes/2024-12-02.md @@ -0,0 +1,40 @@ +# 📅 Dec 2, 2024 + +## 👥 Attendees + +- Alex Scammon (G-Research) [host] +- Abhishek Malvankar (IBM) [host] +- Marlow Warnicke (SchedMD) +- Larry Rudolph (Independent, formerly Two Sigma, MIT) +- Ali Abbas Jaffri (Synthesia, Munich, Germany) +- Victor Lu (Independent, Tampa, Florida) +- Pradeep Madhavarapu (Nvidia) +- Ebin Babu (Stackgenie) +- Mehdi Nassim Khodja (G-Research) +- Shyam (Argonne National Laboratory, ALCF) + +## 📋 Agenda + +- Larry Rudolph (MIT) to present a discussion around d3n +- https://ieeexplore.ieee.org/document/9006396 + +## Discussion Notes + +Introductions +Welcome new attendees and brief self-introductions (e.g., Ali Abbas Jaffri, Victor Lu, Ebin Babu, Shyam). +Reiterate the purpose of the group and its focus areas. 
+Technical Presentations and Discussions +Larry Rudolph's Presentation on D3N and D4N: +Overview of data sharing and scheduling challenges in modern systems. +Evolution of data center architectures, caching strategies, and cooperative caching concepts. +Applications of DAG-based workflows and their implications for scheduling and storage. +Feedback and Observations +Explore the implications of data lineage, provenance, and caching strategies for batch schedulers. +Considerations around hyper-converged systems and distributed caching in multi-cluster environments (e.g., insights from Nvidia’s internal workloads). +Landscape Updates +Updates on the CNCF Batch and Scheduling Landscape: +Acknowledgment of new additions (e.g., Slinky, Godel Scheduler). +Challenges of accommodating an increasing number of batch schedulers. +Future Planning +Set the stage for upcoming discussions in the new year (e.g., data lineage and provenance). +Decision to skip the December 16th meeting and resume in January with a focus on cross-vertical data lineage and batch scheduler integration. diff --git a/tags/tag-workloads-foundation/subprojects/batch/meeting-notes/2024-12-16.md b/tags/tag-workloads-foundation/subprojects/batch/meeting-notes/2024-12-16.md new file mode 100644 index 000000000..16ad4334d --- /dev/null +++ b/tags/tag-workloads-foundation/subprojects/batch/meeting-notes/2024-12-16.md @@ -0,0 +1,36 @@ +# 📅 Dec 16, 2024 + +## 👥 Attendees + +- Alex Scammon (G-Research) [host] +- Larry Rudolph (MIT) +- Marlow Warnicke (SchedMD) +- Mehdi Khodja (G-Research) +- Pradeep Madhavarapu (Nvidia) +- Panchapakesan (Shyam) Shyamshankar +- Victor Lu +- Marcel Lauhoff (clyso) + +## 📋 Agenda + +- Review of previous meetings +- Discussion on Starburst initiative and its recent updates. 
+- Updates and plans for the CNCF Batch Working Group in 2025 + +## Discussion Notes + +Documentation and Planning +Marlow Warnicke emphasized the need for consistent note-taking and highlighted missed documentation from the previous meeting. +Discussions included structuring the group's agenda for 2025, including key topics and deliverables. +Technical Discussions +Alex Scammon highlighted the Starburst project and its ties to previous discussions on Ceph-related technologies. +Mehdi Khodja pointed to the CNCF TAG-Runtime GitHub repository as a resource for ongoing work. +Integration Challenges +Shyam raised concerns about integrating traditional HPC scheduling with Kubernetes-based cloud batch systems, focusing on workload migration and structuring. +Victor Lu discussed hybrid workloads and the challenges of running batch and non-batch jobs within the same clusters. He suggested creating white papers or blogs to guide best practices. +Scheduling and Scalability +Larry Rudolph emphasized the importance of benchmarking for batch systems and proposed it as a key deliverable for the upcoming year. +Marlow Warnicke discussed limitations in the current scheduling frameworks, particularly around scalability and plugin-based scheduling approaches. +Opportunities for Deliverables +The group identified opportunities to produce white papers, conduct surveys, and provide guidance on best practices for batch and hybrid workloads. +Victor Lu suggested creating content to clarify when and how to co-locate workloads in the same clusters. 
diff --git a/tags/tag-workloads-foundation/subprojects/batch/meeting-notes/2025-01-13.md b/tags/tag-workloads-foundation/subprojects/batch/meeting-notes/2025-01-13.md new file mode 100644 index 000000000..e55d3d268 --- /dev/null +++ b/tags/tag-workloads-foundation/subprojects/batch/meeting-notes/2025-01-13.md @@ -0,0 +1,29 @@ +# 📅 Jan 13, 2025 + +## 👥 Attendees + +- Alex Scammon (G-Research) [host] +- Abhishek Malvankar (IBM) [host] +- Panchapakesan (Shyam) Shyamshankar [host] +- Marlow Warnicke (SchedMD) +- Andrew Senetar (CoreWeave) +- Marcel Lauhoff (clyso) +- Simon Forster (Independent) +- Tommy Aldo Sonin (ISC - Insight Softmax) +- Pradeep Madhavarapu (NVIDIA) + +## 📋 Agenda + +- Data Locality, Data Lineage and Batch Scheduling +- Premise: Organizations want location-aware batch scheduling +- Question #1: How much are we losing by not doing it? +- Question #2: What would a scheduler need in order to do a better job of scheduling? +- Question #3: Where would that data/metadata come from? +- Question #4: After all this, will it be worth it? +- What next? What can we do to push this conversation forward? 
+- Links and resources: +- https://asdf.readthedocs.io/en/latest/ +- https://www.w3.org/TR/prov-o/ +- https://openlineage.io/ +- https://github.com/rucio/rucio +- https://github.com/OpenAssetIO/OpenAssetIO diff --git a/tags/tag-workloads-foundation/subprojects/batch/meeting-notes/2025-01-27.md b/tags/tag-workloads-foundation/subprojects/batch/meeting-notes/2025-01-27.md new file mode 100644 index 000000000..36ef56dca --- /dev/null +++ b/tags/tag-workloads-foundation/subprojects/batch/meeting-notes/2025-01-27.md @@ -0,0 +1,86 @@ +# 📅 Jan 27, 2025 + +## 👥 Attendees + +- Alex Scammon (G-Research) [host] +- Abhishek Malvankar (IBM) [host] +- Marlow Warnicke (SchedMD) +- Pradeep Madhavarapu (NVIDIA) +- Andrew Senetar (CoreWeave) +- Boris Litvin (AWS) +- Skyler Malinowski (SchedMD) +- Timothy Middelkoop (Internet2) +- Tommy Aldo Sonin (ISC, aka Insight Softmax) +- Alex Scammon (G-Research) +- Shyam A. (Red Hat) +- Larry Rudolph (Independent, formerly Two Sigma) +- Andrew Mc (Run.ai/NVIDIA) + +## 📋 Agenda + +- Housecleaning for the year: +- Setting the intention for the group in 2025 +- Lining up the next set of talks +- Meeting timing: move to 9am? +- Continuation of Jan 13th discussion: collaboration on technical approach to data locality +- AI-Summarized Notes: +- Housekeeping & Scheduling Adjustments +- Proposed moving the meeting from Monday at 7:30 AM PDT to Tuesday at 8:00 AM PDT to accommodate better participation. +- Confirmation pending input from other members on Slack. +- Setting Intentions for 2025 +- Need to establish goals and structured initiatives instead of only freeform discussions. +- Possible focus areas: +- Understanding batch workloads in cloud-native environments. +- Storage & networking challenges in batch workloads. +- Benchmarking batch performance across schedulers, storage, and networking. +- Workflows & DAGs – exploring how workloads interact and data dependencies. +- Data locality & scheduling – Can schedulers be data-aware? 
+- Exploring multi-node distributed workloads (e.g., GPUs in batch). +- Surveying the Landscape of Batch Computing +- Goal: Understand the current workflows, tools, and pain points from a wider audience. +- Two-step approach: +- Internal survey: Gather input from this group to identify common patterns and challenges. +- External survey: Broader industry outreach to collect real-world data. +- Concerns: +- The diversity of workflows (every organization does batch differently). +- The lack of standardization in workflow tools (Airflow, Dask, Ray, in-house tools). +- Benchmarking Initiative +- Need standardized benchmarks to evaluate batch systems. +- Challenges: +- Users don't know how to run benchmarks correctly or normalize variables. +- Benchmarks must capture real-world complexity (e.g., ETL pipelines, ML training, large-scale simulations). +- Hybrid cloud benchmarking – How does scheduling across cloud providers impact performance & cost? +- Initial proposal: +- Define benchmark workloads. +- Identify key metrics (cost, latency, scheduling efficiency, etc.). +- Publish blog series or whitepapers on findings. +- Data Locality & Scheduling +- Key challenge: Compute scheduling today is not data-aware. +- Need schedulers that: +- Understand where data is located. +- Optimize placement to minimize movement costs. +- Adapt based on real-time cluster constraints. +- Open questions: +- Can data lineage frameworks (e.g., OpenLineage) help schedulers make decisions? +- What existing standards (Terraform, cloud APIs) can describe infrastructure dynamically? +- How do we measure macro vs. micro-level optimizations in scheduling? +- Expanding the Scope: Beyond Kubernetes +- Cloud-native ≠ Kubernetes-only. +- While Kubernetes dominates discussions, legacy HPC batch systems (Slurm, LSF, Grid Engine, etc.) remain critical. +- Some members argue Spark, Dask, and Ray should be included in discussions, as they also perform scheduling. 
+- Consider third-party job placement services that optimize across clouds (e.g., Run:AI, YellowDog, Spot Market solutions). +- Live Notes: +- Tentative: move meeting to 8am PST, Tuesdays? +- Intentions for the next year: +- Blitvin: Benchmarks - specific ones? Or research the space? Storage, schedulers affect performance. To help customers understand what works for them. +- Larry - hard to have anything happen without a benchmark (what is the target, what is success). Many future batch is different than previous in terms of math. +- Blitvin: scheduling throughput is an example for what to use for, I/O for jobs, different Chip performance, Code compatibility +- Larry - want to look for what is useful coming forward. +- Alex - want to talk about the larger context of things, how to use these things. Be cognizant of the larger world, not only scheduling. +- Alex: Maybe start a survey on what workflows look like. What is the commonality between tooling. +- Marlow - More formalized version of what we gave at the BoF talk at KubeCon? +- Alex - maybe more workflow specific. What are people using - airflow, what types of dags, et cetera. If not using anything or doing by hand, please tell us that? +- Tommy - Wants a scheduler aware of where the data is. What tools do this. One of my goals is to figure out what tooling is available specific to this. +- Larry - To expand -> Likes Alex’s suggestion, but also we should all write these down, and discuss. Do we co-locate or do we have it across the network. Where/how much/is it intermediate? +- Larry - write down, see what outputs are with the immediate group (are we unicorns), second we have no commonality, and then figure out what a larger community is (with patterns). +- Do we want to take the above (what workflows are) and then head towards a PoC to get some numbers (benchmarking?). What are the thoughts around that? 
diff --git a/tags/tag-workloads-foundation/subprojects/batch/meeting-notes/2025-02-11.md b/tags/tag-workloads-foundation/subprojects/batch/meeting-notes/2025-02-11.md new file mode 100644 index 000000000..fdedd065d --- /dev/null +++ b/tags/tag-workloads-foundation/subprojects/batch/meeting-notes/2025-02-11.md @@ -0,0 +1,84 @@ +# 📅 Feb 11, 2025 + +## 👥 Attendees + +- Alex Scammon (G-Research) [host] +- Abhishek Malvankar (IBM) [host] +- Marlow Warnicke (SchedMD) +- Tommy Aldo Sonin (Insight Softmax Consulting) +- Andrew Senetar (CoreWeave) +- Larry Rudolph (MIT/MOC) +- Boris Litvin (AWS) +- Victor Lu +- Nikita Buldakov (AWS) +- Victor Lu (AI WG Representative) +- Pradeep Madhavarapu (NVIDIA) + +## 📋 Agenda + +- Start with Larry's high-level workflow user story survey doc +- Explain the other three prongs of research: benchmarking, data locality, and landscape research +- TAG reorgs and other housekeeping + +## ➡️ Next Steps + +- 5. Data Locality & Scheduling +- Current batch schedulers lack data-awareness. +- Discussion focused on: +- Making schedulers understand where data resides. +- Using real-time constraints to optimize scheduling. +- Integrating with frameworks like OpenLineage to track data movement. +- Weka and Hammerspace will participate in future meetings. +- 6. Expanding CNCF Batch Scope Beyond Kubernetes +- Most HPC users prefer Slurm, while ML researchers prefer Kubernetes. +- Goal: Improve Kubernetes scheduling for HPC-style workloads to merge the best of both worlds. +- Need better multi-node job placement. +- 7. CNCF Working Groups & Information Sharing +- Many relevant working groups exist, but they don't communicate well. +- Proposal: +- Assign representatives ("emissaries") to attend relevant CNCF meetings and report back. 
+- Suggested groups to monitor: +- Kubernetes Batch WG +- AI Working Group +- SIG Scheduling +- Device Resource Allocation (DRA) +- Research User Group (RUG) +- Tag Runtime & Workloads +- Action: Identify who already attends these meetings and assign responsibility. +- 8. CNCF Tag Restructuring & Impact on Batch WG +- CNCF is restructuring TAGs. +- The Batch WG will now be a "subproject" under TAG Workloads Foundation. +- Future projects & initiatives should align with this structure. +- 9. Housekeeping & Future Topics +- Co-chair Update: Marlow Warnicke is being onboarded as a co-chair. +- Next Meeting Focus: +- More workflow descriptions. +- Initial data locality & benchmarking discussions. +- Assign emissaries to monitor CNCF groups. +- Start voting on high-priority discussion topics. + +## Discussion Notes + +1. Meeting Time Adjustment +The meeting has officially moved from Monday at 7:30 AM PDT to Tuesday at 8:00 AM PDT. +Participants noted that this time is easier for engagement. +2. Defining Batch Computing Workflows +Larry Rudolph presented a batch computing workflow based on his experience at Two Sigma. +The group is collecting high-level workflow descriptions (without technology specifics) from different industries to compare similarities and differences. +Workflows include: +Fintech (Larry & Alex's model) – Ingestion, normalization, featurization, model training. +ML Research (NVIDIA, AWS, and others) – Needs to be mapped out. +Simulations & Genomics (AWS) – Additional workflows to be added. +Goal: Create a collection of workflow diagrams from different industries. +3. Scheduling & Data Processing Challenges +Traditional cloud systems treat jobs & files independently, but modern workflows involve interdependent processes (e.g., DAGs, real-time streaming). +Challenges: +Data locality – Can schedulers be data-aware? +Job scheduling triggers – Not all jobs can be triggered in a simple DAG. +Multi-node distributed workloads – Especially for AI/ML training. 
+4. Benchmarking Initiative +Need for standardized benchmarks for batch workloads. +Problems: +Users struggle with correct benchmarking methodologies. +Benchmarks should reflect real-world workloads (ETL, ML training, simulations). +Need to consider hybrid cloud performance and cost efficiency. diff --git a/tags/tag-workloads-foundation/subprojects/batch/meeting-notes/2025-02-25.md b/tags/tag-workloads-foundation/subprojects/batch/meeting-notes/2025-02-25.md new file mode 100644 index 000000000..0a00c1ccc --- /dev/null +++ b/tags/tag-workloads-foundation/subprojects/batch/meeting-notes/2025-02-25.md @@ -0,0 +1,51 @@ +# 📅 Feb 25, 2025 + +## 📋 Agenda + +- Start with Larry's high-level workflow user story survey doc +- Benchmarking + +## Discussion Notes + +Attendees +Abhishek Malvankar (IBM Research) +Marlow Warnicke (SchedMD) +Alex Scammon (G-Research) [host] +Larry Rudolph (MIT/MOC) +Victor Lu +Scottie Marlow (Oak Ridge National Laboratory) +Andrew MC +Benchmarking: +Data center benchmarking suite: https://engineering.fb.com/2024/08/05/data-center-engineering/dcperf-open-source-benchmark-suite-for-hyperscale-compute-applications/ +Time vs. Space complexity: +https://pwskills.com/blog/explaining-time-space-complexity-machine-learning/ +Cluster data benchmarks: +https://github.com/google/cluster-data +https://github.com/alibaba/clusterdata +Other: +https://github.com/aliyun/aicb +Key Topics Discussed: +Benchmarking for Batch Systems: +Discussion on how to benchmark batch systems effectively. +The challenge of defining benchmarks beyond single workloads. +Identifying key performance metrics to evaluate different schedulers. +Types of Benchmarks and Profiling: +The need to move beyond single-workload testing to more comprehensive workload profiling. +Combining various approaches like DC-Perf (Meta’s data center performance suite) and Linpack. +Defining benchmarks that measure system performance under various job complexities. 
+Complexity and Categorization: +Using time complexity and space complexity as foundational units. +Differentiating between simple one-shot jobs, DAG-based workflows, and continuous workloads. +How workload dependencies (e.g., AI/ML pipelines) influence scheduling and data locality. +High-Level Workflow Abstraction & Virtual Data Centers: +Concept of virtual data centers for groups of users (labs, teams, companies). +Understanding system-wide scheduling complexities beyond single job execution. +Managing mixed workloads (HPC, interactive, and batch jobs) across different environments. +Collaborating on Benchmarking Research: +The need for academic collaborations to develop benchmarking methodologies. +Engaging universities and national labs for real-world data and simulation. +Plans to draft a document outlining the benchmarking scope, necessary metrics, and methodologies. +Engaging the Community: +Encouraging broader participation in defining real-world use cases. +Identifying key contributors to expand discussion (e.g., from NERSC, Kubernetes Batch WG). +Potentially using KubeCon to gather survey data on real-world batch scheduling needs. 
diff --git a/tags/tag-workloads-foundation/subprojects/batch/meeting-notes/2025-03-11.md b/tags/tag-workloads-foundation/subprojects/batch/meeting-notes/2025-03-11.md new file mode 100644 index 000000000..b04843c6b --- /dev/null +++ b/tags/tag-workloads-foundation/subprojects/batch/meeting-notes/2025-03-11.md @@ -0,0 +1,59 @@ +# 📅 Mar 11, 2025 + +## 👥 Attendees + +- Alex Scammon (G-Research) [host] +- Marlow Warnicke (SchedMD) [host] +- Tommy Aldo Sonin (Insight Softmax) +- Larry Rudolph (MIT) +- Victor Lu (Independent Free Agent) +- Pradeep Madhavarapu (Nvidia) +- Areg Melik-Adamyan (Intel) +- Bernie Wu (MemVerge) +- Andrew Senetar (CoreWeave) +- Johannes Dieterich (AMD) + +## 📋 Agenda + +- Discussion on paper “The Streaming Batch Model for Efficient and Fault-Tolerant Heterogeneous Execution” +- Continuation of Benchmarking conversation + +## ➡️ Next Steps + +- Further exploration of how batch scheduling integrates with AI/ML workflows. +- Nvidia / Run:AI Post-Kubecon Announcement +- Speculation: +- Run:AI may be open-sourcing parts of their stack. +- They want to engage with the CNCF Batch Working Group post-Kubecon. +- Follow-up Action: +- Schedule a dedicated discussion after April 8th. +- High-Level Workflows & Data Locality Initiatives +- Objective: Define standard AI/ML workflow representations. +- Discussion on integrating: +- Data locality and lineage tracking into scheduling. +- Storage-aware batch scheduling improvements. +- Expand discussions post-Kubecon. + +## Discussion Notes + +Benchmarking Framework Development +Objective: Create standardized benchmarking for HPC, batch workloads, and AI/ML workloads. +Key Concerns: +Ensuring benchmarking submission criteria prevent misrepresentation of performance. +Addressing different workload types: single-node, multi-node, storage I/O, networking. +Capturing real-world workload performance instead of vendor-optimized benchmarks. +Proposed Approach: +Define benchmarking criteria and framework structure. 
+Seek feedback from industry and academic partners. +Post-Kubecon, engage universities for additional profiling and data gathering. +Paper Presentation: Hybrid Streaming-Batching Model +Presented by: Larry Rudolph (MIT) +Key Takeaways: +Hybrid models combining batch processing and streaming are more efficient for AI/ML workloads. +Importance of resource allocation (GPUs, CPUs, SSDs) to optimize pipeline execution. +Memory pressure and multi-tenant scheduling require buffering solutions. +Multi-Tenant HPC and Workload Scheduling Challenges +Discussion of core concerns: +How schedulers handle batch, streaming, and hybrid workloads. +Impact of storage hierarchy (NVMe, CXL memory pooling) on workload efficiency. +Multi-tenant scheduling within Kubernetes environments. diff --git a/tags/tag-workloads-foundation/subprojects/batch/meeting-notes/2025-03-25.md b/tags/tag-workloads-foundation/subprojects/batch/meeting-notes/2025-03-25.md new file mode 100644 index 000000000..e6a1285fc --- /dev/null +++ b/tags/tag-workloads-foundation/subprojects/batch/meeting-notes/2025-03-25.md @@ -0,0 +1,48 @@ +# 📅 Mar 25, 2025 + +## 👥 Attendees + +- Alex Scammon (G-Research) [host] +- Abhishek Malvankar (IBM) [host] +- Marlow Warnicke (SchedMD) +- Victor Lu +- Larry Rudolph (MIT) +- Jonathan Skone (NERSC/LBNL) +- Andrew Senetar (SUNK/Coreweave) +- Boris Blitvin (AWS) +- Tommy (Insight Softmax Consulting) + +## 📋 Agenda + +- Data Locality Sub Group Update +- Benchmarking Subgroup Update +- Kubecon Plans? +- RUN.AI Discussion for post-Kubecon + +## Discussion Notes + +Larry Rudolph presented a conceptual model of DAG-based workflows where each phase can use different schedulers or run at different locations. +The group discussed data handoff inefficiencies, co-scheduling challenges, and the impact of using disparate schedulers or data centers. +Emphasis was placed on inefficiencies and SLA violations caused by fragmented pipeline execution. 
+Larry introduced the paradox: "lying" about job length in fair schedulers (e.g. Slurm) can lead to better throughput due to backfilling approximating Shortest Job First (SJF). +Andrew and others noted that user misunderstanding of schedulers contributes to scheduling inefficiencies. +⚙️ Scheduler & Infrastructure Discussion +Victor raised the question of comparing Slurm vs Kubernetes-native schedulers for ML training workloads. +Marlow emphasized Kubernetes inefficiencies in node-level CPU resource management (e.g. lack of hybrid shared/static core control). +Alex pointed out successful large-scale use cases of Kubernetes-native schedulers like Armada and MCAD. +The group acknowledged the need to educate users about scheduler behavior and trade-offs. +Post-Kubecon reboot planned. +Focused on extensible benchmarks, especially those handling scale diversity (1 node → 1000s) and training/inference mix. +Collaboration with UNM, Nvidia, Intel, AMD, and performance-focused academics. +Ongoing collaboration to prototype smarter scheduling vs data movement decisions. +Participants: Boris, Frederick, Tommy, Abhishek, and Alex. +More contributors welcome post-Kubecon. +Victor volunteered to start defining key terms (e.g. workflow types, benchmarking layers). +Goal: avoid poetic ambiguity in future discussions. +Alex encouraged members to serve as emissaries to other CNCF SIGs and working groups. +Ricardo's paused Research User Group was mentioned as a potential collaboration target. +Kubecon Plans +Alex, Abhishek, Bernie, and Victor attending. +Alex at TAG Runtime booth (Thurs 2–5pm). +Bernie presenting on transparent cluster checkpointing (Fri 2:30 PM). +Suggestion to post interesting talks in Slack. 
diff --git a/tags/tag-workloads-foundation/subprojects/batch/meeting-notes/2025-04-08.md b/tags/tag-workloads-foundation/subprojects/batch/meeting-notes/2025-04-08.md new file mode 100644 index 000000000..6838f53f8 --- /dev/null +++ b/tags/tag-workloads-foundation/subprojects/batch/meeting-notes/2025-04-08.md @@ -0,0 +1,76 @@ +# 📅 Apr 8, 2025 + +## 👥 Attendees + +- Alex Scammon (G-Research) [host] +- Abhishek Malvankar (IBM) [host] +- Marlow Warnicke (SchedMD) [host] +- Omri Cohen (Nvidia) +- Ekin Karabulut (Nvidia) +- Timothy Middelkoop (Internet2) +- Pradeep Madhavarapu (NVIDIA) +- Tommy Aldo Sonin (ISC aka Insight Softmax) +- Andrew Senetar (CoreWeave) +- Sohan Kunkerkar (Red Hat) +- Scottie Marlow (ORNL) +- Adam Tetelman (NVIDIA) +- Alan Mutschelknaus (SchedMD) +- Bernie Wu (MemVerge) +- Jonathan Skone (NERSC) +- Kevin Hannon (Red Hat) +- Kyle +- Norman Joyner (StackAV) +- Rob Esker (NVIDIA) +- Roman Baron (NVIDIA) +- Scottie Marlow (ORNL) +- Vinay Sawal (NERSC) +- Dan Desjardins (Distributive) +- Wes Garland (Distributive) + +## 📋 Agenda + +- KAI from RUN.AI/NVIDIA +- Kubecon Updates + +## Discussion Notes + +Kai Scheduler Overview Developed by Run:AI (acquired by Nvidia), Kai focuses on managing AI/ML workloads in Kubernetes. +Targets AI/ML use cases like training, inference, and interactive notebooks. +Designed for high-scale, multi-tenant GPU clusters. +Prioritizes minimizing GPU waste and enabling dynamic resource sharing. +Key Scheduling Concepts: Pod Groups & Quotas Core abstractions provide control over batch and gang scheduling behaviors. +PodGroups: First-class scheduling units with elastic/gang semantics. +Queues: Define resource guarantees, overages, and priorities. +Enables hierarchical, fair-share resource control across teams and projects. +Workload Optimization & Efficiency Techniques for maximizing cluster utilization with minimal disruption. +Fractional GPU sharing with memory isolation (enterprise-only for now). 
+Simulation-based eviction logic avoids unnecessary job disruption. +Prioritization for training vs. inference vs. interactive workloads. +Extensible, Plugin-Based Architecture Designed for modularity and integration with the Kubernetes ecosystem. +Scheduler actions include allocate, consolidate, reclaim, and preempt. +Plugins for queue logic (DRF), job/queue ordering, node scoring, etc. +Compatible with Kubernetes-native features like DRA and autoscalers. +Broad Integration & Ecosystem Compatibility Supports diverse Kubernetes-native and ML-specific workloads. +Dynamic PodGroup assignment via a pod grouper controller. +30+ workload types supported: Kubeflow, Volcano, TrainingOperator, Knative, etc. +Maintains workload-agnostic philosophy to support varied user tooling. +Multi-Cluster and Node Pool Design +Kai runs a separate scheduler per node pool +Node pools used for hardware isolation (e.g., A100 vs T4) +Cluster-wide round-robin submission layer (currently not open source) +Open Source Roadmap +All development in the open; no proprietary fork +Near-term goal: CNCF open governance +Potential integration targets: Kueue, Armada, and cluster autoscalers (incl. Karpenter) +Interest in deeper integration for topology-aware scheduling +Suggestion: explore Kubernetes scheduling gates and clarify which quota system dominates +Noted dual quota systems (Kueue + Kai) might need harmonization +Kai is designed generically, but fractional GPU sharing is Nvidia-only for now +Team open to supporting AMD, Gaudi, Intel, etc. +Mixed-accelerator clusters pose more complex design challenges +Questions around scheduling hierarchy (single vs. 
federated schedulers) +Consensus that multi-cluster is complex and warrants phased planning +Proposal: Unified Pod Group Spec +Noted similar constructs across Kai, MCAD, Kueue (e.g., AppWrapper) +Suggestion to align on a shared, minimal “pod group” spec +Potential follow-up working session across scheduler projects diff --git a/tags/tag-workloads-foundation/subprojects/batch/meeting-notes/2025-04-22.md b/tags/tag-workloads-foundation/subprojects/batch/meeting-notes/2025-04-22.md new file mode 100644 index 000000000..854504aa4 --- /dev/null +++ b/tags/tag-workloads-foundation/subprojects/batch/meeting-notes/2025-04-22.md @@ -0,0 +1,59 @@ +# 📅 Apr 22, 2025 + +## 👥 Attendees + +- Alex Scammon (G-Research) [host] +- Abhishek Malvankar (IBM) [host] +- Marlow Warnicke (SchedMD) [host] +- Ekin Karabulut (NVIDIA) +- Pradeep Madhavarapu (NVIDIA) +- Mikhail Mokrushin (Nebius) – Engineering lead for Soperator +- Roman Luchkov (Nebius) – Product Manager +- Scottie Marlow (ORNL) +- Wes Garland (Distributive) +- Dan Desjardins (Distributive) +- Jonathan Skone (NERSC, LBNL) +- Scotty Marlowe (Oak Ridge National Lab) +- Victor Lu (ex-Oracle) +- Alan Mutschelknaus (SchedMD) +- Skyler Malinowski (SchedMD) +- Bernie Wu (MemVerge – Checkpointing) +- Colin Mixon (Microsoft Azure) + +## 📋 Agenda + +- Say Hello +- New Humans (Hello, why are you here) +- Updates +- Data Group +- Benchmarking +- User Stories +- Definitions +- Discussion +- Soperator + +## Discussion Notes + +Ongoing Working Group Topics +📊 Benchmarking: Marlow is coordinating efforts to define meaningful batch metrics beyond traditional HPC workloads (e.g., LINPACK, HPCG). Still organizing due to post-KubeCon PTOs. +📦 Data Locality: Group exploring intelligent scheduler decisions for moving compute to data vs. vice versa. Includes workload modeling and empirical results from Larry’s shared papers. +🔄 Batch Workflow Use Cases: Alex added real-world quant research workflows to Larry’s original doc. 
Highlights the chaotic nature of exploratory work vs. structured production batch jobs. +📘 Definitions: Victor leads terminology unification between HPC and Cloud Native communities, starting from a CNCF doc and expanding via community feedback. +Presentation: Separator by Nebius +🎯 Separator is a fully open-source Kubernetes operator for provisioning dedicated Slurm clusters. +🧩 Goals: Preserve native Slurm UX for ML researchers, enable multi-tenant managed service delivery with minimal Kubernetes expertise. +Each Slurm cluster lives in its own namespace. +Uses Kubernetes for lifecycle management, but not scheduling. +Persistent shared root FS to mimic traditional HPC environments. +Works in production on Nebius infrastructure (e.g., Nvidia workloads). +Faster provisioning for PoCs +Kubernetes-native HA, scalability, and observability +Clear separation of user/admin concerns via container chroot tricks +Community Trends in Slurm on Kubernetes +Mentioned alternatives: Slinky (SchedMD), KubeFoundry (Nvidia), SUNK (CoreWeave). +Shared desire: run familiar Slurm interfaces atop scalable cloud-native infra. +Debate: whether shared clusters for inference/training offer value vs. reliability concerns → usually results in ring-fenced usage. +Follow-up Ideas +📚 Comparative study suggested: Separator vs. Kai (Run:AI) vs. others. +🎯 Goal: Reduce fragmentation, converge on standards for scheduling, interfaces, and observability. +🔧 Separator: Slurm-as-a-Service on Kubernetes A practical operator built by Nebius to serve ML customers who want Slurm without learning K8s. Empowers provisioning, automation, and observability—while leaving the Slurm user experience untouched. 
diff --git a/tags/tag-workloads-foundation/subprojects/batch/meeting-notes/2025-05-06.md b/tags/tag-workloads-foundation/subprojects/batch/meeting-notes/2025-05-06.md new file mode 100644 index 000000000..e678cb782 --- /dev/null +++ b/tags/tag-workloads-foundation/subprojects/batch/meeting-notes/2025-05-06.md @@ -0,0 +1,67 @@ +# 📅 May 6, 2025 + +## 👥 Attendees + +- Alex Scammon (G-Research) [host] +- Abhishek Malvankar (IBM) [host] +- Marlow Warnicke (SchedMD) [host] +- Dan Desjardins (Distributive) +- Wes Garland (Distributive) +- Ekin Karabulut (Run AI team at NVIDIA) +- Filip Novovic (Insight Softmax Consulting) +- Niranjan Ravichandra (Cedana) +- Omri Cohen (Run AI KAI scheduler) +- Tommy Aldo Sonin (Insight Softmax Consulting) +- Pedro Oliveira (Spectrocloud) +- Andrew McCauley (G-Research) +- Victor Lu (Independent) +- Roman Baron (NVIDIA KAI scheduler / Run.ai) +- Scottie Marlow (ORNL) +- Pedro Oliveira (Spectro Cloud) + +## 📋 Agenda + +- Say Hello +- New Humans (Hello, why are you here) +- Updates +- Data Group +- Benchmarking +- User Stories +- Definitions +- Discussion +- Super brief Kubecon roundup and observations + +## Discussion Notes + +Benchmarking Initiative Updates +Marlow Warnicke shared updates on the AI benchmarking initiative. +Focused on developing well-defined workloads, similar to Linpack and HPCG for AI systems. +Objective: Create apples-to-apples comparisons of AI workloads across systems. +Challenges: Characterizing workloads, scaling tests, and representative benchmarks. +Data Locality Initiative +Dan Desjardins presented a model for data locality. +Discussion on the importance of moving compute to data or vice versa based on costs and efficiency. +University of Chicago paper referenced that claimed a 7% scheduling optimization improvement with locality-aware policies. +Plans to build a framework for testing different scheduler policies on AWS, with support from Boris. 
+Vocabulary Definitions Initiative +Victor Lu is leading a project to standardize vocabulary across old-school HPC and cloud-native communities. +Purpose: Improve communication and understanding within the community. +Kubecon Trends: Multi-Cluster and Batch Scheduling Gaining Steam +Emerging importance of multi-cluster scheduling in HPC and cloud-native workloads. +Discussion around KubeCon trends indicating rising interest in multi-cluster orchestration. +Volcano and KAI Scheduler were highlighted as active projects in this space. +Interest in hybrid cloud setups with multi-cluster scheduling. +Framework Search for Data Locality Testing +Current solutions (e.g., University of Chicago’s repo) are too specific and not easily adaptable. +The group is searching for a more flexible testing framework. +Standardization and API Changes +Discussion on how to simplify Kubernetes API machinery to improve multi-pod and multi-cluster scheduling. +Marlow noted existing Kubernetes primitives are insufficient for large-scale multi-pod batch scheduling. +Ideas for improvements were raised, including better job grouping and ordering capabilities. +Next Meeting: Kevin from Red Hat / Kubernetes Batch Working Group / Kueue will join to discuss multi-cluster scheduling and all your other Kueue questions. +Action Items for the group: +Continue defining the Benchmarking Initiative. +Data Locality Testing Subgroup to meet +Explore standardizing Kubernetes API enhancements for batch scheduling. +Participate in Vocabulary Definitions to align terms across cloud-native and HPC. 
+High-level Batch Workflow User Stories diff --git a/tags/tag-workloads-foundation/subprojects/batch/meeting-notes/2025-05-20.md b/tags/tag-workloads-foundation/subprojects/batch/meeting-notes/2025-05-20.md new file mode 100644 index 000000000..8aeef06ef --- /dev/null +++ b/tags/tag-workloads-foundation/subprojects/batch/meeting-notes/2025-05-20.md @@ -0,0 +1,72 @@ +# 📅 May 20, 2025 + +## 👥 Attendees + +- Alex Scammon (G-Research) [host] +- Abhishek Malvankar (IBM) [host] +- Marlow Warnicke (SchedMD) [host] +- Tommy Aldo Sonin (Insight Softmax Consulting) +- Pradeep Madhavarapu (NVIDIA - Storage and Data) +- Kevin Hannon (RedHat) +- Alan Parry +- Jason Parraga +- Wes Garland +- Ekin Karabulut (Kai NVidia) +- Scottie Marlow (ORNL) +- Omri Cohen (Kai Nvidia) +- Roman Baron (Kai NVidia) +- Alan Mutschelknaus (SchedMD) +- Jejan Zele Pjchev +- Skyler (SchedMD) +- Victor Lu (Independent) +- Erez Freiberger + +## 📋 Agenda + +- Say Hello +- New Humans (Hello, why are you here) +- Updates +- Data Group +- Benchmarking +- User Stories +- Definitions +- Discussion +- Kueue Discussion +- Brief on ToC plans +- (if time) New KAI Scheduler feature: Guaranteed Minimum Runtime Before Preemption/Reclaim + +## Discussion Notes + +Discussion Notes +Welcome & Introductions & Standing Updates +Brief participant intros +Updates on: +Data Locality Subteam +Benchmarking +High-level workflows user stories +Batch Definitions +Topic: Kevin on Kueue & Multi-Cluster Scheduling +Kueue Design Goals: +Batch scheduling framework for Kubernetes. +Supports queue-based scheduling, PodSets, integration with JobSet/MPIJob/Volcano. +Emphasizes declarative APIs and plugin-compatibility. +Multi-Cluster Scheduling: +Kueue is currently single-cluster by design. +Possibility to run Kueue instances in each cluster and use higher-level orchestration to coordinate. +No native support yet for global queues, remote resource visibility, or policy federation. 
+Gang Scheduling & Locality: +Gang scheduling via PodSets, but lacks native support for topology-aware placement. +Scheduling intelligence often offloaded to custom controllers or frameworks. +Key Questions Raised: +Could Kueue evolve to support cross-cluster workloads? +How can multiple schedulers (Kueue, Volcano, Kai) avoid fragmenting the ecosystem? +How extensible is Kueue for experimentation with data-locality policies? +Topic: New proposed feature for Kai Scheduler: Preemption Grace Period +The Kai team is introducing a feature that allows workloads a grace period before being preempted by other jobs — whether from other queues or higher-priority workloads within the same queue. +The intent is to reduce abrupt interruptions of in-flight jobs, offering a configurable delay before preemption occurs. +A detailed design document has been created, with examples covering several anticipated use cases. +Omri invited the working group to review the design and provide feedback, particularly in case there are use cases the team hadn’t yet considered. +Next Steps and Action Items +Assign research tasks and documentation responsibilities +Prepare shared Google Doc for experiment definitions and workload proposals +Group members are encouraged to review the design doc for Preemption Grace Period and contribute suggestions or questions to refine the feature. 
diff --git a/tags/tag-workloads-foundation/subprojects/batch/meeting-notes/2025-06-03.md b/tags/tag-workloads-foundation/subprojects/batch/meeting-notes/2025-06-03.md new file mode 100644 index 000000000..78019292d --- /dev/null +++ b/tags/tag-workloads-foundation/subprojects/batch/meeting-notes/2025-06-03.md @@ -0,0 +1,81 @@ +# 📅 Jun 3, 2025 + +## 👥 Attendees + +- Alex Scammon (G-Research) [host] +- Abhishek Malvankar (IBM) [host] +- Marlow Warnicke (SchedMD) [host] +- Kevin Hannon (Red Hat) +- Johan Jatko (NVIDIA) +- Alan Parry - Runs YellowDog +- Bugra Gedik - Nvidia human on KAI +- Filip Novovic (ISC) +- Dan Desjardins - Canada - Distributive +- Dejan Zele Pejchev - GResearch maintainer of Armada and operator +- Dennis Marttinen (Aalto University) - Masters student in Finland, moving to the national supercomputing lab in Finland +- Ekin Karabulut - NVida, KAI +- Jason Kincl - RHT, background in HPC +- Omri Cohen - NVIDIA, KAI +- Pedro Oliveira +- Pradeep Madhavarapu, NVIDIA +- Roman Baron - NVIDIA, KAI +- Scottie Marlow - ORNL +- Tommy Aldo Sonin - (ISC) +- Victor Lu - Independent +- Wes Garland - works with Dan - Distributive + +## 📋 Agenda + +- Say Hello +- New Humans (Hello, why are you here) +- Updates +- Data Group +- Benchmarking +- User Stories +- Definitions +- Discussion +- New KAI Scheduler feature: Guaranteed Minimum Runtime Before Preemption/Reclaim (Ekin, Omri) – continued from last meeting +- New Initiative: Cloud Native HPC (Dennis Marttinen, Diego Ciangottini) +- llm-d (Abhi) + +## Discussion Notes + +Discussion Notes +Subproject Overview +Active initiatives include: +Data Locality +Ongoing experiments explore: +Comparing performance when compute is scheduled near data vs. when data is moved to compute. +Previous calls with UChicago and Distributive highlighted I/O bottlenecks and cluster awareness challenges. +Goals include identifying scheduler behaviors that reduce latency and improve data proximity outcomes. 
+Dennis’ project may form a fifth initiative. +🛠 Feature Proposal: Preemption Grace Period +Overview +Omri Cohen (NVIDIA) presented a feature currently under development for the Kai Scheduler, but applicable more broadly to batch scheduling systems. +The core idea is to give jobs a short grace period before being preempted or reclaimed. This allows workloads to finish critical operations like: +Writing checkpoints +Cleaning up intermediate outputs +Exiting gracefully +Use Case +In multi-tenant environments where queue fairness or resource rebalancing leads to frequent preemptions, abrupt job termination can: +Waste compute cycles +Lead to data corruption or lost progress +Undermine the reliability of scheduled workloads +A grace period could mitigate this by introducing a buffer during which: +The job can detect it is about to be interrupted +The scheduler respects a small time window before forcibly reclaiming the resource +Design Highlights +The proposal is documented in a detailed design doc (linked in the meeting chat). +Features include: +Configurable grace durations +Applicability across queues or within single-queue preemptions +Scenarios where jobs may cede voluntarily within their grace period if checkpointing finishes quickly +Call to Action +Omri encouraged feedback from the group, particularly around: +Edge cases (e.g., GPU jobs with large memory states) +Interactions with job annotations or pod priorities +Potential integration into other schedulers (Volcano, custom plugins) +The group agreed this could be a critical improvement for supporting ML/AI jobs that run for long durations but are sensitive to failure. +Next Steps +A full walkthrough of the Kai scheduler’s architecture (and where this feature fits in) is planned for a future meeting. +WG members interested in contributing or reviewing the feature were invited to comment on the design doc. 
diff --git a/tags/tag-workloads-foundation/subprojects/batch/meeting-notes/2025-06-17.md b/tags/tag-workloads-foundation/subprojects/batch/meeting-notes/2025-06-17.md new file mode 100644 index 000000000..80b46718a --- /dev/null +++ b/tags/tag-workloads-foundation/subprojects/batch/meeting-notes/2025-06-17.md @@ -0,0 +1,63 @@ +# 📅 Jun 17, 2025 + +## 👥 Attendees + +- Alex Scammon (G-Research) [host] +- Abhishek Malvankar (IBM) [host] +- Marlow Warnicke (SchedMD) [host] +- Filip Novovic (Insight Softmax Consulting) +- Wes Garland (Distributive) +- Carl Nettelblad (Uppsala University, Sweden) +- Sanyo John (Khalifa University) +- Ekin Karabulut (NVIDIA) +- Dennis Marttinen +- Dejan Pejcev (G-Research) +- Omri Cohen (NVIDIA) +- Bernard Wu (MemVerge) +- Dan Desjardins (Distributive) +- Timothy Middelkoop (Internet2) +- Jason Kincl (Red Hat) +- Jonathan Skone (NERSC) +- Victor Lu (Independent) +- Diego (working with Dennis, was on very briefly) + +## 📋 Agenda + +- Say Hello +- New Humans (Hello, why are you here) +- Updates +- Data Group +- Benchmarking - On hold until Fall +- User Stories +- Definitions +- Discussion +- New Initiative: Cloud Native HPC (Dennis Marttinen, Diego Ciangottini) +- llm-d (Abhi) +- Slurm-bridge (Marlow) + +## Discussion Notes + +Discussion Notes +📊 Presentation: ML Benchmarking Stack & Topology-Aware Scheduling (Dennis Marttinen, Aalto University) +Overview & Motivation +Dennis presented an evolving framework to study how infrastructure topology impacts ML workloads under batch scheduling. +The goal is to develop reproducible tests that simulate real-world ML jobs while exposing the scheduling system to measurable topology trade-offs. +System Architecture +The proposed framework includes: +“Submission engine” to trigger batches of ML jobs using YAML-based workload definitions. +A coordinator or controller layer to interpret cluster topology and handle scheduling intelligence. +Backend runtimes (e.g. Kubernetes or SLURM) as execution targets. 
+YAMLs describe not just job resources, but topology intents (e.g., “prefer local GPU memory”, “avoid inter-node bandwidth bottlenecks”). +Example Use Cases +Detecting GPU placement inefficiencies (e.g., when multi-GPU jobs get scheduled across NUMA boundaries or separate hosts). +Simulating cluster heterogeneity — jobs run differently depending on whether scheduled in the cloud or on-prem. +Comparing job efficiency under various scheduling assumptions (locality-first vs fairness-first). +Current Status +A working version exists and runs with HuggingFace + PyTorch Lightning workloads. +Dennis showed logs, sample metrics, and preliminary performance data. +Community Discussion +Group aligned on the importance of this tooling to: +Anchor upcoming benchmarking efforts +Help validate scheduler plugins or policies (e.g., Volcano, Run.ai, Kai) +Interest in contributing workload definitions from other members (e.g., simulation, genomics, ETL). +Discussion about whether this becomes part of the existing “data locality” subproject or its own standalone stream. 
diff --git a/tags/tag-workloads-foundation/subprojects/batch/meeting-notes/2025-07-01.md b/tags/tag-workloads-foundation/subprojects/batch/meeting-notes/2025-07-01.md new file mode 100644 index 000000000..fa32d0d84 --- /dev/null +++ b/tags/tag-workloads-foundation/subprojects/batch/meeting-notes/2025-07-01.md @@ -0,0 +1,46 @@ +# 📅 Jul 1, 2025 + +## 👥 Attendees + +- Alex Scammon (G-Research) [host] +- Abhishek Malvankar (IBM) [host] +- Alan Parry (YellowDog) +- Bernie Wu (Voltron Data) +- Carl Nettelblad (Uppsala University) +- Dan Desjardins (Distributive) +- Dennis Marttinen (CSC – IT Center for Science, Finland) +- Ekin Karabulut (NVIDIA) +- Jason Kincl (Red Hat) +- Mofi (Unknown Affiliation) +- Omri Cohen (NVIDIA) +- Tommy Aldo Sonin (Insight Softmax Consulting) +- Discussion Notes +- 🔍 Topic: LLMD (LLM Distributed Inference) +- Abhishek (IBM Research) introduced LLMD as a potential equivalent of “Apache Tomcat” for LLM inference — a reusable, scalable, possibly universal inference system. +- The focus is on enabling multi-node distributed inference using large language models that don’t fit on a single node. +- LLMD is designed to complement or extend vLLM by handling workloads that require tensor parallelism or pipeline parallelism across nodes. +- ❓ HPC vs Inference +- A core question emerged: Should LLM inference be classified as HPC? +- Carl argued that inference has predictable performance and more relaxed scheduling requirements than training — more like service workloads. +- Alex noted that the trend toward multi-node, latency-sensitive inference (e.g., in scientific pipelines) is pushing the boundaries toward HPC territory. +- Alan shared perspective from YellowDog users — inference isn’t treated like HPC by default but can be HPC depending on scale and context. +- Jason (Red Hat) noted that commercial users often just want “the thing to run,” not caring whether it’s HPC, batch, or something else. 
+- Multi-node inference adds orchestration complexity and may not cleanly map onto existing service mesh or MPI-based assumptions. +- Bernie raised concerns about data locality and model sharding, especially with large-scale inference across regions. +- What primitives or scheduler features do inference workloads need that HPC schedulers already solve? Should this WG write a white paper or blog post outlining the taxonomy of inference workloads (online, batch, HPC-like)? +- Could LLMD become a standard component in cloud-native HPC inference pipelines? +- Schedule a follow-up meeting focused specifically on LLMD and inference job classification. +- Reach out to teams doing multi-node inference in production (e.g., HuggingFace, Together.ai, MosaicML) for input or demos. +- Invite more representation from academic compute centers doing LLM inference at scale. + +## 📋 Agenda + +- 👋Welcome and Introductions (Hello, why are you here) +- Updates +- Data Group +- Benchmarking +- User Stories +- Definitions +- Discussion +- 🧠 Primary Topic: LLMD and Multi-Node Inference +- 🤔 Group Discussion: Are inference workloads HPC? 
diff --git a/tags/tag-workloads-foundation/subprojects/batch/meeting-notes/2025-07-15.md b/tags/tag-workloads-foundation/subprojects/batch/meeting-notes/2025-07-15.md new file mode 100644 index 000000000..6854ccd85 --- /dev/null +++ b/tags/tag-workloads-foundation/subprojects/batch/meeting-notes/2025-07-15.md @@ -0,0 +1,53 @@ +# 📅 Jul 15, 2025 + +## 👥 Attendees + +- Alex Scammon (G-Research) [host] +- Abhishek Malvankar (IBM) [host] +- Marlow Warnicke (SchedMD) [host] +- Diego Ciangottini - INFN - interink-project.dev maintainer +- Omri Cohen (NVIDIA) +- Vivian Hafener (SchedMD) +- Alex Kimber AWS, London (HPC specialists for financial services) +- Carl Nettelblad (Uppsala University) +- Alan Parry (Yellow Dog) +- Alan Mutschelknaus (SchedMD) +- Bernie Wu (Memverge) +- Skyler Malinowski (SchedMD) +- Sanyo John (Khalifa University, Abu Dhabi) +- Pradeep Madhavarapu (NVIDIA) +- Wes Garland (Distributive) +- Jason Kincl (Red Hat) +- Tommy Aldo Sonin (Insight Softmax Consulting) +- Dennis Marttinen (CSC – IT Center for Science, Finland) +- Victor Lu (Independent) +- Dan Desjardins (Distributive) +- Seaborn [Systems company - audio issues prevented clear identification] +- Alan Parry (YellowDog) +- Ekin Karabulut (NVIDIA) + +## Discussion Notes + +👋 Welcome & Introductions (“Hello, why are you here?”) +🔍 Bringing Dennis back — Continue the conversation on gaps in the Kubernetes ecosystem for HPC +🧪 Slurm-bridge demo — Led by Alan Mutschelknaus +Subgroup Updates +Data Locality: Progress on testing frameworks and evaluating approaches like BatSim/SimGrid. Ongoing exploration of AWS resources (Tommy to connect with Boris). +Benchmarking: Temporarily paused, with plans to revisit in fall after major releases. Stream vs. HPCG vs. LINPACK benchmarks compared. +Definitions: Work continues on aligning terminology (e.g., preemption vs. reclaim, job vs. workload) to avoid cross-project confusion. 
+Ecosystem Gaps (Dennis’ thread continued) +Kubernetes lacks primitives needed for advanced HPC scheduling (e.g., data-aware scheduling, preemption guarantees, topology hints). +Need to assess whether these belong in core Kubernetes or as scheduler extensions. +Agreement to continue documenting gaps systematically for WG charter deliverables. +Slurm-Bridge Demo (Marlowe) +Preview of Slurm integration bridge allowing workloads to flow between Slurm-managed clusters and Kubernetes. +Emphasis on how this enables hybrid HPC/cloud environments and makes Kubernetes more viable in supercomputing contexts. +Early adopter interest expressed; further demos planned. +General Takeaways +Strong cross-vendor interest (NVIDIA, Red Hat, YellowDog, Distributive, CSC) in bridging HPC practices into Kubernetes-native environments. +Recognition that WG should maintain a balance: practical near-term prototypes (Slurm bridge, BatSim tests) and longer-term ecosystem change (K8s primitives, CNCF charter goals). +Consensus to bring Dennis back for a deeper dive into ecosystem gaps. +✅ Next Steps +Tommy to follow up with Boris on AWS hardware testing options. +Dennis to prepare follow-up on ecosystem gaps. +Marlow to schedule a fuller Slurm-bridge demo. 
diff --git a/tags/tag-workloads-foundation/subprojects/batch/meeting-notes/2025-07-29.md b/tags/tag-workloads-foundation/subprojects/batch/meeting-notes/2025-07-29.md new file mode 100644 index 000000000..4dbb8e19e --- /dev/null +++ b/tags/tag-workloads-foundation/subprojects/batch/meeting-notes/2025-07-29.md @@ -0,0 +1,136 @@ +# 📅 Jul 29, 2025 + +## 👥 Attendees + +- Alan Parry - (Yellow Dog) +- Marlow Warnicke - SchedMD +- Alex Scammon - G research (meeting host) +- Victor Lu +- Dejan Zele Pejchev (G-Research) +- Filip Novovic (Insight Softmax) +- Vivian Hafener - SchedMD +- Abhishek Malvankar (IBM) +- Carl Nettelblad +- Bernie Wu (MemVerge) +- Jonathan Skone - NERSC +- Tommy Aldo Sonin - ISC (Insight Softmax) +- Alan Mutschelknaus - SchedMD (primary presenter) +- Ekin - (KAI/NVIDIA) +- Barret Abel + +## Discussion Notes + +👋 Welcome & Introductions (“Hello, why are you here?”) +Slurm Bridge +Link to presentation +Marlow/SchedMD: Slurm Bridge presentation accepted +Dejan: Deep dive on multi-cluster schedulers accepted +Alex & Marlow: Panel discussion wait-listed +Victor: Non-batch talks for AI Day +Abhishek: Cloud AI Day submission +Data Locality Subgroup: +Recent presentation: University of Indiana on Atlas and Rucio +Key finding: Second confirmed scheduler (after LSF) that leverages data location for scheduling decisions +Research question: Move data to compute or compute to data? 
+Tools: BatSim and SimGrid for simulations +Other initiatives: Benchmarking (on hold until Marlow returns), Vocabulary (Alex/Victor picking up), Dennis/Diego white paper (continuing) +🔍 Main Topic: Slurm Bridge Deep Dive (Alan Mutschelknaus, SchedMD) +Slides: https://slurm.schedmd.com/MISC25/Slurm_Bridge_CNCF-Batch-20250729.pdf +Continuation from July 15th meeting +Architecture recap: +Operates as Kubernetes Scheduler +Sits between K8s and Slurm as translation layer +Translates K8s pods → Slurm "placeholder jobs" +Slurm decides when pods run +Uses Slurm's "external job" concept — no slurmd on compute nodes required +Scheduling framework: +Pre-filter and filter plugins (custom) +Default bind plugin (standard K8s) +Future: DRA integration will require additional plugin points +🎯 Gang Scheduling with Leader Worker Sets +The problem: +Leader Worker Sets require all pods running together (AI inferencing workloads) +Default K8s scheduler: Can partially schedule → deadlock/thrashing +Pods get scheduled individually, some succeed, some fail +After ~30 seconds, Leader Worker Set tears down and retries +Slurm Bridge solution: +Bundle all Leader Worker Set pods into single Slurm placeholder job +Slurm understands all must run together +Won't start any pods until resources available for entire group +Prevents partial scheduling and resource deadlock +🎮 GPU Scheduling Discussion (Primary Technical Focus - Abhishek) +Current state: +Working on Device Plugin integration first +Translate pod GPU requests → Slurm Grez/Trez concepts +Slurm aware of GPU requirements for scheduling +Actual GPU usage still handled by Kubernetes pod +Challenge identified: +Nvidia GPU Operator limitation: Provides quantity but not specific GPU selection +AI workload requirements: Need specific GPU for NVLink, RoCE, interconnect topologies +Roadmap: +Device Plugin support (current) +DRA (Dynamic Resource Allocation) integration (future) +Nvidia DRA includes "compute domain" for multi-node placement +CPU 
plugin integration also planned +Open question: How to ensure K8s selects GPU that Slurm expects? +Still being worked through +May involve custom labels/environment variables +Marlow's note: Community work on resource alignment to modify pod specs appropriately +⚙️ Backfill Scheduling Philosophy (Alex's question) +Impedance mismatch: +Slurm: Maximize shared resource utilization +Cloud: Assumes infinite compute availability +Slurm backfill scheduler: +Wants visibility into all future work +Optimizes by "sneaking in" smaller jobs +vs. Kueue: Holds back work until it can run +Implementation status: +Backfill capability exists with many knobs +Not yet deeply integrated into Slurm Bridge +Goal: Engage backfill to push Slurm's optimization strengths into K8s +🔄 Autoscaling Integration (Filip's question) +Question: How does Slurm Bridge work with K8s autoscaling (Carpenter)? +Answer: +Slurm's "external nodes" capability helps +Need: Callback from autoscaler to Slurm control plane +Inform Slurm when nodes added to partition +No dynamic detection yet +Barrett's experience: Has "kludgy carpenter mousetrap" working at POC level +🐶 Yellow Dog Integration Perspective (Alan Perry) +Two modes: +Provision-only: Provision sub-scheduler, users work directly (e.g., Ray) +Full integration: Provision, scale, route workload through Yellow Dog (e.g., Slurm, SQS Batch) +Future goal: Better understanding of subsystem capacity/utilization +🧩 Kubernetes Primitives Discussion (Marlow's Proposal) +Current state: +Jobs, Job Sets, Leader Worker Sets, Pod Groups +Each designed for specific use cases +Hard to transfer between domains +Proposal: New initiative to: +Be more intentional about K8s primitives for batch +Simplify for scheduler consumption +Key stakeholder: Kevin Hannon (Kueue) - should be pulled in +Follow-up: Deferred to future meeting +Motivation: Many relevant K8s SIGs, don't want everyone attending everything +Model: Distribute attendance, share 1-min briefings at batch WG meetings 
+Initial assignments: +SIG Scheduling issues: +US meeting cancelled more often than not +Europe meeting more consistent +Opportunity: European participants could help revive +https://arxiv.org/abs/2503.19470 +https://arxiv.org/abs/2504.11536 +Topic: ReTool system +How to coordinate GPU selection between Slurm and K8s? +Should backfill scheduler be engaged for K8s workloads? +Can autoscaling integrate smoothly with Slurm Bridge? +What K8s primitives are actually needed for batch workloads? +Which SIGs need ongoing tracking by this working group? +SchedMD: Continue GPU/DRA integration work +Marlow/Alex: Engage Kevin Hannon for primitives discussion +SIG Volunteers: Begin attending and preparing briefings +Alex: Send reminder of SIG attendance assignments +August: Alex may be absent 1-2 meetings (meetings will continue) +"We've just deployed a very large supercomputer AI factory, one of the top 21. We're moving away from Slurm to cloud native schedulers." — Barrett (on Novo Nordisk deployment) +"We have jobs, job sets, Leader Worker Set, and Pod Group... they're all designed for very specific use cases. But they're hard to transfer to different domains." — Marlow +"Slurm might want to know about all future work so that it can optimize what's coming down and scheduling things that way... that's also kind of opportunity for Slurm Bridge to push those concepts into Kubernetes." — Alan M. 
diff --git a/tags/tag-workloads-foundation/subprojects/batch/meeting-notes/2025-08-12.md b/tags/tag-workloads-foundation/subprojects/batch/meeting-notes/2025-08-12.md new file mode 100644 index 000000000..828a34758 --- /dev/null +++ b/tags/tag-workloads-foundation/subprojects/batch/meeting-notes/2025-08-12.md @@ -0,0 +1,111 @@ +# 📅 Aug 12, 2025 + +## 👥 Attendees + +- Abhishek Malvankar - Meeting facilitator +- Omri Cohen - NVIDIA (primary presenter) +- Ekin Karabulut - Run.ai/Kueue team +- Kevin Hannon - Kueue team +- Dan Desjardins - Distributive +- Victor Lu +- Barrett Abel - Novo Nordisk (joining from airplane, off camera) + +## Discussion Notes + +👋 Welcome & Introductions (“Hello, why are you here?”) +Time-aware Fairness (Omri and Ekin from Run.ai/NVIDIA) +Design Doc (link) +Example PR (link) +🔍 Topic: Time-Aware Fairness in Resource Scheduling +Omri Cohen (NVIDIA) presented KAI scheduler's approach to adding historical usage awareness to resource allocation +Current problem: Instantaneous fair share only — no memory of past usage +Design doc and PR: https://github.com/NVIDIA/KAI-Scheduler/pull/311 +📊 Current KAI Fair Share Model (3-Tier) +Deserved Quota — guaranteed minimum resources (reserved) +Queue Priority — leftover resources allocated by priority tiers +Over-Quota Weights — relative allocation within priority tier (e.g., weight 2 = 2x resources vs weight 1) +Node pools divide cluster into homogeneous sections for quota management +🚨 The Problem: Resource Hogging Without History +Example scenario: +2 queues, equal weights, 16 GPUs total +Fair share: 8 GPUs each +Both want 16-GPU gang jobs (all-or-nothing) +Current behavior: Q1 allocated first (arbitrary tiebreaker: creation time) +Result: Q1 runs forever, Q2 never gets resources +Desired behavior: Oscillating pattern based on usage history — Q1 runs, accumulates "debt," Q2 reclaims resources +💡 Proposed Solution: Time-Aware Fair Sharing +Core concept: Historical usage affects queue priority to achieve 
fairness over time rather than instantaneously +Key design principles: +Normalize usage to cluster capacity +Using 1 GPU-hour when cluster vacant → minimal penalty +Using 1 GPU-hour when cluster fully utilized → significant penalty +Formula: Normalized Usage = (GPU hours used) / (Cluster capacity during usage period) +Compare to deserved share +Calculate normalized weight (e.g., 2 queues weight 1 each → 0.5 normalized weight) +Compare normalized usage to normalized weight +Usage = weight → queue used its "fair share" +Integration with DRF (Dominant Resource Fairness) +Still uses DRF for queue ordering +Time-aware fairness skews DRF weights +Queues with past usage considered "less starved" +Non-contended resources don't affect ordering +📈 Comparison: Round Robin → DRF → Time-Aware +Round robin: Simple alternation, ignores resource requirements +DRF: Better, considers multiple resource types, instantaneous only +Time-aware: Builds on DRF, adds historical usage consideration +Admission Fair Sharing introduced last release (Alpha) +Prioritizes workloads from users not currently using resources +Link: https://kueue.sigs.k8s.io/docs/concepts/admission_fair_sharing/ +KEP: https://github.com/kubernetes-sigs/kueue/tree/main/keps/4136-admission-fair-sharing +Kevin's observations: +Similar gap in cloud-native vs HPC schedulers +HPC model: Users get X compute hours, use freely, then blocked or reduced priority +Every organization handles differently +Open questions: Wall time vs compute time? GPU hours vs CPU hours? +LSF has dynamic fair-share: https://www.ibm.com/docs/en/spectrum-lsf/10.1.0?topic=share-dynamic-user-priority +Existing HPC schedulers already address this problem +❓ Key Technical Questions +Usage calculation (Abhishek): +Based on Kubernetes allocation, not actual GPU utilization +Not measuring DCGM metrics (yet) +Off-peak usage penalty: +Should using resources at 3 AM (when vacant) count the same as peak usage? 
+Current design: No — normalized to capacity makes off-peak usage count less +GPU count vs time: +Using 100 GPUs for 1 hour vs 1 GPU for 100 hours — same penalty? +Current approach: Track GPU-hours, normalize to capacity +🎮 User Gaming Prevention (Abhishek's concern) +Real-world observation from NSF clusters: +Users jump between queues to game the system +Submit to whichever queue runs fastest +Creates hot/cold cycling +Planned solution (Phase 2): +Track per-user allocation data (not yet implemented) +User's past usage affects position within queue +Jumping queues → lower priority in new queue +Still allows legitimate multi-initiative work +Current state: This document focuses on queue-level fairness only +Thermodynamics analogy (Dan Desjardins): +Partition function formula resembles thermodynamics +Could add "temperature" term for non-linear tuning +Exponential weighting for more sensitive allocations +CPU vs GPU time tracking (Victor Lu): +Can system differentiate CPU vs GPU time in real-time? +Dan showed experimental scheduler tracking CPU seconds vs GPU seconds separately +Job reports show breakdown of compute requirements +Real-time aggregate tracking possible +Victor's interest: ML compiler implications for scheduling (noted as未讨论 topic) +What metrics? Wall time, compute time, or GPU-hours? +Penalty decay? Should old usage matter less than recent? +Resource weighting? GPU-hours vs CPU-hours — equal weight? +Utilization vs allocation? Should actual usage (DCGM) factor in? +How to define "fair"? Every organization may differ +Community: Review design document, provide feedback via Slack +Omri/Ekin: Share design document link +Dan: Share experimental scheduler approach for CPU/GPU time tracking +NVIDIA/Run.ai: Continue Phase 1 development (queue-level) +Phase 2: User-level tracking to prevent gaming +Community: Test feature when released +"It's really hard to interpret what is fair in this use case... 
we're trying to find some algorithm to at least give an intuitive sense of fairness in most cases." — Omri +"This is an interesting gap I've found in a lot of the cloud-native schedulers. When I think back to something like Slurm... a user gets X amount of compute hours, and they're free to use that for how long they want." — Kevin +"If I used one GPU hour last night when nobody was using the cluster, it feels intuitively like I shouldn't be as punished as someone who used the GPU hour when the cluster was fully utilized." — Omri diff --git a/tags/tag-workloads-foundation/subprojects/batch/meeting-notes/2025-08-26.md b/tags/tag-workloads-foundation/subprojects/batch/meeting-notes/2025-08-26.md new file mode 100644 index 000000000..6c028490a --- /dev/null +++ b/tags/tag-workloads-foundation/subprojects/batch/meeting-notes/2025-08-26.md @@ -0,0 +1,107 @@ +# 📅 Aug 26, 2025 + +## 👥 Attendees + +- Albert Reuther - MIT Lincoln Laboratory Supercomputing Center (LLSC) +- Alex Scammon - G research / Insight Softmax (meeting host) +- Dan Desjardins - Distributive +- Alan Mutschelknaus - SchedMD (SchedMD) +- Diego Ciangottini - INFN (National Institute for Nuclear Physics, Italy) +- Filip Novovic - Insight Softmax (with Alex) +- Omri Cohen - Run.ai / NVIDIA +- Vivian Hafener - SchedMD +- Dennis Marttinen - CSC (Finnish national/EuroHPC supercomputer provider), working on Lumi supercomputer +- Abhishek Malvankar - IBM +- Erez Freiberger - NVIDIA + +## Discussion Notes + +👋 Welcome & Introductions (“Hello, why are you here?”) +🔍 Topic: Interactive and Urgent HPC Scheduling +Albert Reuther introduced MIT Lincoln Lab's approach to scheduling for rapid prototyping DoD workloads that require fundamentally different scheduling than traditional batch HPC. +Paper: https://arxiv.org/pdf/2401.14550 +Runs Interactive and Urgent HPC Workshop at SC and ISC conferences. 
+Urgent ≠ Interactive alone — urgent means sensor-driven, deadline-constrained workloads +Real example: Fusion tokamak experiments +Runs periodically (not on fixed schedule) +8-minute run generates data +Must process in 10-15 minutes for next run +Requires "elbows out" priority — displace running jobs immediately +Key phrase: Jobs must "elbow out" other work when sensor data arrives +MIT campus cluster experiment revealed critical issue: perception of scarcity drives hoarding behavior +Observed users: +Reserving full allocations (32 nodes) +Running their own schedulers on allocated resources +Squatting on nodes "just in case" +Result: Cluster became fully batch-oriented, losing interactive capabilities +Core insight: "It is truly a perception. You have to maintain a perception that almost all the time, resources will consistently be there." +📊 Solution: Statistical Allocation Management +Strategy: Artificially constrain per-user allocations to maintain buffer +Example: 700-node partition, max 24 nodes per user +Result: 60-80% utilization with 20% always available for next launch +Leadership challenge: Took 10 years to convince management this was better than 95%+ utilization +Albert's philosophy: "Our utilization is how well our users are using the system, not how much of the compute nodes are consistently busy." +No time limits on most partitions (radical departure from batch HPC) +Makes forecast scheduling impossible ("complete BS" per Albert) +But eliminates scarcity anxiety +Trust-based system: "Use your share, but be kind" +Long-running inference jobs (weeks) are tolerated +System engineers monitor: no activity for 3+ days → email user +Typical response: "Yeah, sorry, man. I'll kill it." 
+Key requirement: All users under same organization — cultural grooming possible +Whole node scheduling — better performance + security (SSH sessions) +"Spot queue" — background jobs that can be killed in 10-15 seconds +Must use checkpointing +Restartable on preemption +Inspired by hyperscaler spot instances +Fractional node partitions — for Jupyter notebooks, small debugging jobs +Rejected approaches: +Virtual memory/hibernation (too slow, complex with MPI) +Over-provisioning without statistical control +🔧 Advanced Ideas: Micro-VMs + SRIOV (Dennis) +Dennis Marttinen (CSC/Lumi supercomputer) proposed SRIOV for GPU sharing: +Hardware-isolated VRAM per micro-VM +Shared compute capacity +QoS controls (e.g., interactive gets 80% priority) +Separate micro-VMs for Slurm vs Kubernetes +Avoids mixing schedulers while sharing hardware +Albert's reaction: Interesting direction, aligned with containerization trends +⚠️ Fault Tolerance (Abhishek's question) +Short job duration = higher failure tolerance +Users report failures immediately +Heavy users know --exclude flag +Advantage: Most jobs run hours-to-days, not weeks-to-months (unlike DOE) +Recent issue: ECC errors on GPUs — prototyping nature allows more tolerance +Originally: No memory management → nodes wedged unexpectedly +Current: 8 GB per core guardrail (double the 4 GB norm) +Slowly ratcheting down toward strict cgroup enforcement +User estimates highly variable ("My job takes 64 gigs" → actually dozens either side) +External research (Jay McLaughlin, Tufts): No job > 6 hours without checkpoint +Forces restartability, creates preemption opportunities +Some apps (VASP molecular dynamics) make this extremely hard +Albert's response: "We're still gonna try... This is not an excuse. We're doing it for your own good." 
Related: MemVerge does multi-machine checkpointing
— Alex (joking about trust-based approach) diff --git a/tags/tag-workloads-foundation/subprojects/batch/meeting-notes/2025-09-09.md b/tags/tag-workloads-foundation/subprojects/batch/meeting-notes/2025-09-09.md new file mode 100644 index 000000000..2512737c2 --- /dev/null +++ b/tags/tag-workloads-foundation/subprojects/batch/meeting-notes/2025-09-09.md @@ -0,0 +1,78 @@ +# 📅 Sep 9, 2025 + +## 👥 Attendees + +- Alex Scammon (G-Research) [host] +- Abhishek Malvankar (IBM) [host] +- Marlow Warnicke (SchedMD) [host] +- Alan Mutschelknaus (Microsoft) +- Alan Parry (Microsoft) +- Ben Deitch (NVIDIA) +- Joseph Sirak (NVIDIA) +- Niranjan Ravichandra (Microsoft) +- Oleg Avdeev (Independent / CNCF ecosystem contributor) +- Roman Baron (Intel) +- Vivian Hafener (Google) +- William Arndt (Research Computing / National Lab ecosystem) + +## 📋 Agenda + +- 👋 Welcome & Introductions (“Hello, why are you here?”) + +## Discussion Notes + +Hybrid HPC + Kubernetes Environments +Common pattern described: traditional batch systems still doing core compute +Kubernetes used around the edges (services, orchestration, pipelines) +[Alan M] notes this split is very common in enterprise and research settings +Several people nodding to “two control planes” being the reality today +Scheduling Semantics +Kubernetes scheduling still struggles with tightly coupled workloads +MPI-style jobs remain difficult to express cleanly +Gang scheduling comes up repeatedly +[Ben D] mentions that everyone seems to reinvent similar mechanisms +[Roman B] notes differences in how projects interpret “gang” semantics +Feels like a lot of solutions are bolt-ons rather than native concepts +Hard to reason about correctness across implementations +Networking Constraints +Networking quickly becomes a major thread +[Joseph S] raises latency sensitivity as a first-order concern for training workloads +Current Kubernetes scheduling largely ignores network topology +[Alan P] points out that HPC schedulers treat networking as 
fundamental, not optional +Difficulty expressing: +proximity +fabric awareness +collective communication needs +Observations that networking is often invisible until performance is bad +Resource Modeling +Discussion shifts into resource description +GPUs already complicated; networking and memory locality even harder +[Niranjan R] notes that current resource models don’t reflect real constraints +Hard to express things like: +shared bandwidth +topology-aware placement +memory hierarchy +Several comments about avoiding scheduler-specific extensions +Desire for something portable across projects +Ecosystem Fragmentation +Multiple people mention solving the same problems in isolation +[Oleg A] notes lack of shared vocabulary across projects +Makes upstream communication difficult +Hard to say “this is what batch needs” when every project frames it differently +Results in diluted signal when engaging Kubernetes SIGs +AI/ML zas a Forcing Function +AI/ML training workloads repeatedly mentioned +These workloads expose HPC-style requirements very clearly +[William A] notes that many issues are not new — just newly visible +Feels like an opportunity to align HPC and AI conversations +Problems now showing up at much larger scale +Coordination & Next Conversations +Clear agreement that nobody wants to create another scheduler +Value is in comparison, coordination, and shared understanding +Interest in future deep dives: +networking +scheduling semantics +topology-aware placement +[Vivian H] suggests this space is useful for surfacing patterns before proposing solutions +General sentiment that these conversations are already happening informally +Better to have a shared place to capture them diff --git a/tags/tag-workloads-foundation/subprojects/batch/meeting-notes/2025-09-23.md b/tags/tag-workloads-foundation/subprojects/batch/meeting-notes/2025-09-23.md new file mode 100644 index 000000000..e1ec1d58c --- /dev/null +++ 
b/tags/tag-workloads-foundation/subprojects/batch/meeting-notes/2025-09-23.md @@ -0,0 +1,82 @@ +# 📅 Sep 23, 2025 + +## 👥 Attendees + +- Alex Scammon (G-Research) [host] +- Abhishek Malvankar (IBM) [host] +- Marlow Warnicke (SchedMD) [host] +- Alan Mutschelknaus (Microsoft) +- Alan Parry (Microsoft) +- Ben Deitch (NVIDIA) +- Joseph Sirak (NVIDIA) +- Niranjan Ravichandra (Microsoft) +- Oleg Avdeev (Independent / CNCF ecosystem contributor) +- Roman Baron (Intel) +- Vivian Hafener (Google) +- William Arndt (Research Computing / National Lab ecosystem) +- Antonio Ojea (Red Hat) +- Tenzen-y (Independent / MPI Operator contributor) + +## 📋 Agenda + +- 👋 Welcome & Introductions (“Hello, why are you here?”) + +## Discussion Notes + +Introductions +Call starts with brief hellos and confirmations of audio +Focus today is networking and batch / HPC-style workloads +Participants joining specifically from MPI and DRA-related work +Networking as a first-class concern +Early framing that networking is often treated as secondary in Kubernetes +[Antonio Ojea] explains that most schedulers today assume networking “just works” +In practice, this breaks down badly for tightly coupled workloads +Networking is not just bandwidth, but topology, locality, contention, and collective behavior +[Tenzen-y] notes that MPI workloads are extremely sensitive to placement +Even small topology mismatches can have outsized performance impact +MPI operator perspective +[Tenzen-y] walks through how MPI jobs behave in real environments +MPI assumes a fairly stable, predictable network model +Kubernetes introduces churn and abstraction layers that MPI is not designed for +Placement often matters more than raw compute availability +Hard today to express “these pods need to be close” +MPI operators frequently reimplement scheduling logic themselves, leading to duplication and brittleness +DRA Net discussion +[Antonio Ojea] describes motivations behind DRA Net +Goal is to expose network devices and 
capabilities more explicitly +Treat networking resources more like accelerators +Make network capabilities schedulable rather than implicit +Avoid relying entirely on opaque CNI behavior +Open question around how much fidelity is realistic in early models +Scheduler integration challenges +Discussion on where responsibility should live: scheduler vs operator vs resource model +[Alan Mutschelknaus] raises portability concerns across schedulers +Risk of approaches that only work in a single stack +[Ben Deitch] draws parallels to early GPU scheduling approaches +Similar pattern of out-of-tree logic emerging +Kubernetes resource model limitations +Repeated theme that current abstractions are insufficient +Networking does not map cleanly to scalar resources +[Niranjan Ravichandra] raises concern about overfitting to current hardware +Need abstractions that can survive future network designs +Discussion of shared vs exclusive allocation models +Network behaves as shared-but-constrained, which is difficult to encode +AI/ML training workloads +AI/ML training cited repeatedly as a concrete driver +Large distributed training jobs exhibit MPI-like behavior +[Joseph Sirak] notes networking bottlenecks often dominate overall cost +Poor placement can waste significant GPU time +Reinforces need for networking visibility in scheduling decisions +Ecosystem alignment +Similar conversations happening across different groups +Risk of parallel efforts diverging +Value in aligning language and problem framing +[Vivian Hafener] highlights importance of sharing real constraints upstream +Strong desire to avoid premature API design +Focus should remain on understanding the problem shape first +Follow-ups and future threads +Interest in deeper dives on topology-aware scheduling +Collective communication modeling as a recurring topic +Interaction between DRA and schedulers called out +General agreement that continued MPI–Kubernetes dialogue is valuable +Sense that this is a long-overdue 
cross-ecosystem conversation diff --git a/tags/tag-workloads-foundation/subprojects/batch/meeting-notes/2025-10-07.md b/tags/tag-workloads-foundation/subprojects/batch/meeting-notes/2025-10-07.md new file mode 100644 index 000000000..8cbcb5f72 --- /dev/null +++ b/tags/tag-workloads-foundation/subprojects/batch/meeting-notes/2025-10-07.md @@ -0,0 +1,79 @@ +# 📅 Oct 7, 2025 + +## 👥 Attendees + +- Alex Scammon (G-Research) [host] +- Abhishek Malvankar (IBM) [host] +- Marlow Warnicke (SchedMD) [host] +- Dan Desjardins (Microsoft) +- Filip Novovic (Independent / Kubernetes contributor) +- Niranjan Ravichandra (Microsoft) +- Tommy Aldo Sonin (Google) +- Victor Lu (Microsoft) +- William Arndt (Research Computing / National Lab ecosystem) + +## 📋 Agenda + +- 👋 Welcome & Introductions (“Hello, why are you here?”) + +## Discussion Notes + +Introductions +Smaller group today +Mix of Microsoft and Google participants, plus research/HPC representation +Conversation stays very focused and technical +Framing the problem space +Early discussion centers on why batch workloads continue to feel “unnatural” in Kubernetes +[Dan Desjardins] notes that many teams still treat batch as a special case rather than a first-class workload type +Observation that most platform assumptions are optimized for long-running services +Batch jobs often feel bolted on afterward +Workload characteristics +[William Arndt] describes characteristics common in research and HPC workloads +tightly coupled jobs +sensitivity to placement +predictable execution windows +These workloads behave very differently from microservices +Scheduling delays or misplacement have outsized impact +Scheduling behavior +[Niranjan Ravichandra] discusses scheduler behavior under contention +Default scheduling policies prioritize fairness and throughput +Batch workloads often want predictability instead +Tension between cluster-wide efficiency and job-level performance +Hard to express intent today +Queueing and prioritization 
+Discussion around queueing models +[Victor Lu] mentions that many systems reimplement queueing semantics outside Kubernetes +Often handled by higher-level controllers or external systems +Results in fragmented control loops +Makes debugging scheduling decisions difficult +User intent vs system behavior +[Filip Novovic] raises issue of intent expression +Users can say “I need X resources” but not “I need them arranged this way” +Lack of expressive primitives for: +job shape +co-scheduling +affinity beyond simple labels +Forces platform teams to encode policy implicitly +Operational pain points +[Tommy Aldo Sonin] notes that failures are often non-obvious +Jobs technically schedule but perform poorly +From a user perspective, system looks “correct” but results are bad +Makes root cause analysis extremely difficult +Particularly painful for ML training workloads +Observability gaps +Several comments about lack of visibility +Hard to understand why a job was placed the way it was +[Dan Desjardins] mentions desire for scheduler explanations +Similar to “why did this pod land here?” but at job level +Relationship to upstream Kubernetes +Discussion around where these problems should be addressed +Some belong in scheduler behavior +Some in APIs +Some potentially in tooling around scheduling +Concern about scattering responsibility across too many SIGs +Closing thoughts +General agreement that batch workloads stress Kubernetes in predictable ways +These issues recur across organizations +Value seen in capturing these patterns clearly +Interest in continuing with focused discussions +Especially around expressing workload intent and improving scheduler transparency diff --git a/tags/tag-workloads-foundation/subprojects/batch/meeting-notes/2025-10-21.md b/tags/tag-workloads-foundation/subprojects/batch/meeting-notes/2025-10-21.md new file mode 100644 index 000000000..3a92d41ae --- /dev/null +++ b/tags/tag-workloads-foundation/subprojects/batch/meeting-notes/2025-10-21.md @@ 
-0,0 +1,59 @@ +# 📅 Oct 21, 2025 + +## 👥 Attendees + +- Alex Scammon (G-Research) [host] +- Abhishek Malvankar (IBM) [host] +- Marlow Warnicke (SchedMD) [host] +- Dan Desjardins (Microsoft) +- Filip Novovic (Independent / Kubernetes contributor) +- Niranjan Ravichandra (Microsoft) +- Victor Lu (Microsoft) +- William Arndt (Research Computing / National Lab ecosystem) + +## 📋 Agenda + +- 👋 Welcome & Introductions (“Hello, why are you here?”) + +## Discussion Notes + +Batch workloads in practice +[William Arndt] describes continued friction running research-style workloads on Kubernetes +Batch users care deeply about predictability +Start time and placement matter more than raw elasticity +Many workloads are run as “experiments,” not services +Retries and restarts are often expensive +Scheduling expectations vs reality +[Niranjan Ravichandra] notes mismatch between Kubernetes scheduler goals and batch user expectations +Scheduler optimized for fairness and throughput +Batch users often want reservation-like semantics +Difficulty expressing “run when everything is ready” +Leads to partial scheduling and wasted resources +Queueing and coordination +[Victor Lu] discusses queueing approaches layered on top of Kubernetes +Many teams effectively build queue managers externally +Results in multiple control loops acting on the same cluster +Hard to reason about ownership of decisions +Failures can cascade between systems +Job-level semantics +Discussion around lack of true job abstraction in Kubernetes +Pods remain the fundamental unit +Batch users think in terms of jobs, not pods +[Dan Desjardins] notes this mismatch shows up repeatedly +Tooling often forced to reconstruct job state externally +Observability and debugging +Difficulty understanding why jobs behave the way they do +Placement decisions opaque +Scheduling delays hard to attribute +Desire for clearer signals from the scheduler +Not just what happened, but why +Upstream implications +Discussion around where these 
concerns belong upstream +Some feel this spans SIG Scheduling, SIG Node, and workload APIs +Risk of fragmentation +Agreement that concrete examples help drive upstream change +Closing discussion +Recognition that same themes recur each meeting +Predictability, intent expression, and transparency remain central issues +Value seen in continuing these conversations +Particularly useful for shaping upstream discussions with real workload data diff --git a/tags/tag-workloads-foundation/subprojects/batch/meeting-notes/2025-11-18.md b/tags/tag-workloads-foundation/subprojects/batch/meeting-notes/2025-11-18.md new file mode 100644 index 000000000..322e7852b --- /dev/null +++ b/tags/tag-workloads-foundation/subprojects/batch/meeting-notes/2025-11-18.md @@ -0,0 +1,62 @@ +# 📅 Nov 18, 2025 + +## 👥 Attendees + +- Alex Scammon (G-Research) [host] +- Abhishek Malvankar (IBM) [host] +- Marlow Warnicke (SchedMD) [host] +- Tommy Aldo Sonin (Insight Softmax Consulting) +- Dejan Zele Pejchev (G-Research) + +## 📋 Agenda + +- 👋 Welcome & Introductions (“Hello, why are you here?”) + +## Discussion Notes + +Context setting +Conversation opens with discussion of recent CNCF and KubeCon activity +[Abhishek] mentions the scale of recent events and number of attendees +Framing around how batch and HPC topics continue to surface in many forums +HPC and batch visibility +[Dejan] notes that batch and HPC workloads are increasingly visible again +Not new problems, but resurfacing due to AI and training workloads +Sense that many of these issues never fully went away +Discussion that these workloads historically lived outside Kubernetes +Now being forced back into cloud-native conversations +Practical workload behavior +[Tommy] talks about how real batch workloads behave very differently from services +Jobs often have strong expectations around start time and coordination +Late or partial starts can invalidate results +Emphasis on “runs as an experiment,” not something continuously retried +This 
mindset doesn’t align well with default Kubernetes behavior +Scheduling expectations +Discussion around how Kubernetes assumes continuous reconciliation +Batch jobs often want a single coordinated moment of execution +[Dejan] points out that schedulers do not naturally think in terms of “all-or-nothing” +Leads to jobs starting in degraded states +Difficulty expressing intent like: +don’t start unless everything is ready +fail fast instead of partially running +Queueing and control loops +[Tommy] mentions that queueing almost always exists outside Kubernetes +External systems decide when a job should run +Kubernetes decides where, often without understanding job-level intent +Multiple control loops again seen as a source of confusion +Hard to reason about ownership of decisions +User experience and debugging +Discussion on how confusing this is for end users +From user perspective, system looks broken +From platform perspective, system is behaving “as designed” +[Dejan] notes that explaining this gap consumes significant engineering time +Often not bugs, but mismatched mental models +Broader ecosystem implications +Recognition that these issues appear across organizations +Not specific to any one scheduler or platform +[Abhishek] suggests these recurring themes are useful signals +Particularly when engaging upstream groups +Closing thoughts +Agreement that documenting these conversations is valuable +Patterns repeat across meetings and contexts +Sense that batch workloads consistently expose blind spots in current abstractions +Notes seen as helpful for shaping future upstream discussions diff --git a/tags/tag-workloads-foundation/subprojects/batch/meeting-notes/2025-12-02.md b/tags/tag-workloads-foundation/subprojects/batch/meeting-notes/2025-12-02.md new file mode 100644 index 000000000..bf6c4c859 --- /dev/null +++ b/tags/tag-workloads-foundation/subprojects/batch/meeting-notes/2025-12-02.md @@ -0,0 +1,72 @@ +# 📅 Dec 2, 2025: Slurm Topology Awareness + +## 👥 
Attendees + +- Alex Scammon (G-Research) [host] +- Abhishek Malvankar (IBM) [host] +- Marlow Warnicke (SchedMD) [host] +- Tim Wickberg (SchedD) +- Marlow Warnicke (Google) +- Dan Desjardins (Distributive) +- Abhishek Malvankar (Microsoft) +- Barrett Abel (Google) +- Kevin Hannon (Red Hat) +- Tommy Aldo Sonin (Insight Softmax Consulting) + +## 📋 Agenda + +- 👋 Welcome & Introductions (“Hello, why are you here?”) +- Topology by Tim Wickberg + +## Discussion Notes + +Conference reflections +[Tim Wickberg] mentions seeing recurring batch and scheduling themes at QCon +Observations that many talks still treat batch as a niche problem +Despite growing AI and training workloads +[Marlow Warnicke] notes that scheduling conversations keep resurfacing across different venues +Same questions, different framing +Batch workloads resurfacing +Discussion that batch and HPC problems never really went away +They were simply less visible for a while +Now resurfacing strongly due to AI training and large-scale data processing +[Dan Desjardins] comments that Kubernetes assumptions remain heavily service-oriented +Batch continues to feel like a second-class workload +Job semantics vs pod semantics +[Tommy Aldo Sonin] reiterates that batch users think in jobs, not pods +A job either runs correctly or it doesn’t +Partial execution often produces unusable results +Kubernetes focuses on eventual convergence +Batch workloads often want a single, coordinated execution moment +Scheduling expectations +Discussion around mismatch between scheduler goals and user intent +Schedulers optimize utilization and fairness +Batch users care about predictability and correctness +[Abhishek Malvankar] notes that these tensions show up repeatedly in upstream conversations +Hard to represent “don’t start until everything is ready” +Queueing and admission control +[Dan Desjardins] describes how many real systems implement queueing externally +Admission control handled outside Kubernetes +Scheduler only sees 
pods once they are released +Leads to fragmented control loops +Makes it difficult to reason about ownership when things go wrong +Observability and explanations +[Barrett Abel] raises lack of transparency around scheduling decisions +Users often ask “why is my job waiting?” +System provides little actionable explanation +Distinguishing between lack of resources vs placement constraints is difficult +AI/ML pressure +AI training workloads continue to amplify all of these issues +GPU time is expensive and highly sensitive to placement quality +[Kevin Hannon] notes that inefficient scheduling directly translates into wasted capital +Makes these problems more visible to leadership +Upstream communication +Discussion around how to present these issues upstream +Abstract complaints don’t land well +Concrete workload examples resonate much more +Agreement that recurring themes across meetings are valuable signals +Closing thoughts +Strong sense that the same core issues continue to repeat +Job semantics, intent expression, queueing, and transparency remain central +Value seen in continuing to document these discussions +Notes increasingly useful as historical record and upstream input diff --git a/tags/tag-workloads-foundation/subprojects/batch/meeting-notes/2026-01-13.md b/tags/tag-workloads-foundation/subprojects/batch/meeting-notes/2026-01-13.md new file mode 100644 index 000000000..29ab8ba89 --- /dev/null +++ b/tags/tag-workloads-foundation/subprojects/batch/meeting-notes/2026-01-13.md @@ -0,0 +1,20 @@ +# 📅 Jan 13, 2026: Networking! (Postponed) + +## 👥 Attendees + +- Alex Scammon (G-Research) [host] +- Abhishek Malvankar (IBM) [host] +- Marlow Warnicke (SchedMD) [host] +- Jonathan Skone (NERSC) +- Filip Novovic (ISC) + +## 📋 Agenda + +- 👋 Welcome & Introductions (“Hello, why are you here?”) +- Benchmarking! +- Scheduling AI workloads (10:30 CST on Wednesdays) +- Networking! 
+ +## Discussion Notes + +No recording of this one – and one of our presenters didn’t show up so we rescheduled for January 27th. diff --git a/tags/tag-workloads-foundation/subprojects/batch/meeting-notes/2026-01-27.md b/tags/tag-workloads-foundation/subprojects/batch/meeting-notes/2026-01-27.md new file mode 100644 index 000000000..cf8a70a2b --- /dev/null +++ b/tags/tag-workloads-foundation/subprojects/batch/meeting-notes/2026-01-27.md @@ -0,0 +1,32 @@ +# 📅 Jan 27, 2026: New Year’s Plans + +📽️ [Recording](https://www.youtube.com/watch?v=d1aAQQ1KNlE) +🤖 [AI Summary](https://zoom-lfx.platform.linuxfoundation.org/meeting/99965231171-1769529600000/summaries?password=ec727820-92aa-47e9-80c3-73f4dfb8ee66) + +## 👥 Attendees + +- Alex Scammon (G-Research) [host] +- Abhishek Malvankar (Red Hat) [host] +- Marlow Warnicke (NVIDIA) [host] + +## 📋 Agenda + +- 👋 Welcome & Introductions ("Hello, why are you here?") + +## 📝 Quick Recap + +The meeting began with Filip and James introducing themselves and discussing technical difficulties with accessing the meeting link, which was eventually resolved when James shared a new link for Batch WG. The conversation then shifted to addressing broader technical issues where some participants were experiencing difficulties joining the correct Zoom call, including the need to create new user accounts. The group ultimately decided to switch to the new Zoom call, with Abhishek taking the lead to join the other meeting and determine the next steps. + +## ➡️ Next Steps + +- Abhishek: Ask Antonin if he can create an LFID account and join the new meeting link. + +## 📋 Summary + +### First WG Meeting Introductions + +Filip and James discussed technical difficulties with accessing a meeting link, which was resolved when James shared a new link for Batch WG. They briefly introduced themselves and their backgrounds, with Filip mentioning he is from Serbia and works at IC Compute, and James noting he is from Nepal and works at his university. 
The conversation ended with both expressing enthusiasm about participating in their first WG meeting. + +### Zoom Meeting Access Issues + +The meeting participants experienced technical difficulties joining the correct Zoom call, with some being directed to create new user accounts while others were on the old meeting link. Abhishek mentioned that the speaker was having trouble accessing the old Zoom meeting, and he attempted to resolve the issue by asking Antonin to create an account for the new meeting. The group decided to switch to the new Zoom call, with Abhishek taking the lead to join the other meeting and determine the next steps. diff --git a/tags/tag-workloads-foundation/subprojects/batch/meeting-notes/2026-02-10.md b/tags/tag-workloads-foundation/subprojects/batch/meeting-notes/2026-02-10.md new file mode 100644 index 000000000..26cd67666 --- /dev/null +++ b/tags/tag-workloads-foundation/subprojects/batch/meeting-notes/2026-02-10.md @@ -0,0 +1,68 @@ +# 📅 Feb 10, 2026: DRANet and the MPI Operator + +📽️ [Recording](https://www.youtube.com/watch?v=BCSH0ZExpJI) +🤖 [AI Summary](https://zoom-lfx.platform.linuxfoundation.org/meeting/99965231171-1770739200000/summaries?password=fd8f1329-9cf2-4fd8-8a6f-b1f7a2f7f86b) + +## 👥 Attendees + +- Alex Scammon (G-Research) [host] +- Abhishek Malvankar (Red Hat) [host] +- Marlow Warnicke (NVIDIA) [host] + +## 📋 Agenda + +- 👋 Welcome & Introductions ("Hello, why are you here?") +- DRANet and the MPI Operator + +## 📝 Quick Recap + +The meeting focused on DRANet and the MPI operator, with Antonio explaining how DRANet enables dynamic resource allocation and device-level topology awareness in Kubernetes. He described how the system exposes hardware resources through drivers and allows users to request specific combinations of GPUs and NICs without manual node labeling. The discussion covered how DRANet works with various drivers, including NVIDIA's GPU driver, and how it can be integrated with batch scheduling systems. 
Kevin raised questions about user adoption and compatibility with existing scheduling systems, while Marlow inquired about benchmarking against bare metal setups. The group also discussed upcoming topics including workload description, topology awareness, and integration with other systems like Slurm. + +## ➡️ Next Steps + +- James: Share and encourage team members to review and edit the benchmarking documentation for AI Batch Novacy; team members are invited to check and improve the docs. +- Marlow: Continue updates and coordination on the AI Batch White Paper; team members are encouraged to add material or updates as assigned. +- Marlow: Schedule and hold the next AI Batch White Paper working meeting (noted as tomorrow at 10:30 Central/8:30 Pacific); share calendar link as needed. +- Kevin: Ensure Kueue is updated to evaluate CEL expressions for DRA integration, particularly to support resource claim templates with device matching. +- Marlow: Play with DRANet and related components (e.g., with Kubeflow) to evaluate performance and integration with existing schedulers, and compare with bare metal. +- Marlow: Follow up with Tim to schedule his return to discuss how Slurm handles topology awareness. +- Alex/Victor: Coordinate with OCP AI Hardware Software Co-Design Working Group to invite a speaker to present on their work to this group. +- Kevin: Present on the decoupled pod group and workload API proposal at an upcoming meeting (queued up by Alex). + +## 📋 Summary + +### New Platform for Meeting Accessibility + +The meeting focused on transitioning to a new platform to improve meeting accessibility and note-taking processes. Alex explained that the new system would generate automatic meeting notes and YouTube links, reducing manual effort. Antonio mentioned an existing example of the DRANet with MPI operator in the documentation, which Marlow requested to review. 
The meeting experienced some technical difficulties, with participants joining different Zoom meetings, but these issues were eventually resolved. + +### AI Scheduling White Paper Progress + +Marlow discussed the progress of a white paper on scheduling AI and Kubernetes, which is near completion and has been updated with help from Sabrina. The team meets weekly to work on the paper, with the next meeting scheduled for the following day. Marlow also mentioned that he has three presentations at KubeCon Europe, including a panel discussion on native resources, a Slurm Bridge presentation on DRA and CPU plugin demo, and a talk on Workloads Foundation. + +### KubeCon Europe Presentation Planning + +The group discussed upcoming activities at KubeCon Europe, where Kevin will present on workload-aware scheduling efforts in Kubernetes and rename the Kueue booth to highlight this focus. Antonio mentioned he would attend but primarily focus on networking, while James introduced a new benchmarking initiative for batch scheduling. The team also touched on ongoing work related to the AI Batch White Paper and DRA/MPI operator projects. + +### Kubernetes Telco Networking Enhancements + +Antonio explained the work being done at Google on Kubernetes to improve networking for telco workloads, focusing on PCI routing between GPUs and NICs within nodes and across data center layouts. He described a new model of driver composition that allows users to match GPUs and NICs within the same PCI, which was released in version 1.34. Antonio also discussed plans to implement device-level network topology matching for more complex and elastic cloud environments, working with NVIDIA on this initiative. The discussion touched on how DRANet aims to be application-agnostic, leaving MPI and other distributed communication frameworks to handle topology discovery and communication. 
+ +### DRANet: Device-Level Workload Subdivision + +Antonio explained the concept of DRANet, which allows users to create workloads based on device-level topology instead of using entire nodes, effectively subdividing nodes into multiple workloads. Abhishek asked about the alignment capabilities, and Antonio clarified that while DRANet currently supports GPUs and NICs, there is ongoing work to extend this to CPU and NUMA node levels, with Praveen working on a DRA CPU driver. The team is still finalizing the parameters and agreeing on how different platforms expose NUMA, with plans to release new features later this year. + +### GPU Monitoring Automation Challenges + +Antonio discussed the challenges of exposing raw metrics for GPU monitoring and emphasized the need for an automated system that can react to issues, potentially involving self-healing mechanisms. He mentioned ongoing discussions with Google's PM and a project called Straggler that offers similar capabilities, but more exploration is required. Kevin raised a question about the interaction between DRA drivers from different vendors, and Antonio explained the goal of building applications that can compose workloads across different devices, leveraging common structures and APIs for network interfaces. + +### NVIDIA DRA Integration in Kubernetes + +The group discussed the integration of NVIDIA DRA drivers and resource claims for GPU and NIC management in Kubernetes. Antonio explained how the system works with separate resource claims for networking devices and GPUs, with drivers publishing resource objects that the scheduler can use for allocation. Marlow inquired about benchmarking MPI performance compared to bare metal, and Antonio mentioned internal development benchmarks but noted they are not yet public. Kevin raised questions about Kueue's ability to handle resource claim templates, particularly for DRA integration, and the need for Kueue to evaluate CEL expressions. 
+ +### Batch System Scheduling Evolution + +Antonio and Alex discussed the evolution of batch system scheduling, focusing on the transition from manual node labeling to a more dynamic and automated approach. Antonio explained how drivers now handle label discovery and result discovery, simplifying the process for end users who can request specific device classes without needing detailed infrastructure knowledge. Marlow expressed interest in testing the system with MPI and Kubeflow to assess performance compared to bare metal, while Kevin raised questions about the integration with Kubernetes' workload-aware scheduling, particularly topology-aware scheduling. Antonio confirmed that the system aims to enable topology-aware scheduling at the device level and mentioned ongoing work with Boyt Tysinski and Marco to implement this feature, with a focus on defining internode networking requirements through APIs. + +### DRANet Adoption and Challenges + +The meeting focused on discussions about DRANet and its adoption, with Antonio highlighting the need for full-stack solutions and automatic controllers to drive user migration. Kevin expressed concerns about slow user adoption and the challenges of supporting extended resources. Alex mentioned upcoming topics, including Kevin's workload description proposal and Tim's discussion on topology awareness, while encouraging participants to review Marlow's white paper. The group agreed to reconvene in a few weeks with more related topics. 
diff --git a/tags/tag-workloads-foundation/subprojects/batch/meeting-notes/2026-02-24.md b/tags/tag-workloads-foundation/subprojects/batch/meeting-notes/2026-02-24.md new file mode 100644 index 000000000..8a14aa556 --- /dev/null +++ b/tags/tag-workloads-foundation/subprojects/batch/meeting-notes/2026-02-24.md @@ -0,0 +1,79 @@ +# 📅 Feb 24, 2026: AI Scheduling Whitepaper Review + +📽️ [Recording](https://www.youtube.com/watch?v=iCqMqJ0Z9Cw) +🤖 [AI Summary](https://zoom-lfx.platform.linuxfoundation.org/meeting/99965231171-1771948800000/summaries?password=ec730155-2422-4711-9b62-473dc5ce12e5) + +## 👥 Attendees + +- Alex Scammon (G-Research) [host] +- Abhishek Malvankar (Red Hat) [host] +- Marlow Warnicke (NVIDIA) [host] + +## 📋 Agenda + +- 👋 Welcome & Introductions ("Hello, why are you here?") +- AI Scheduling Whitepaper Review + +## 📝 Quick Recap + +The meeting focused on reviewing and discussing a draft white paper on cloud-native batch scheduling, which needs to be split into four separate 10-page papers due to length constraints. The group, led by Marlow, reviewed the current document's structure and content, including sections on AI workloads, scheduling approaches, and infrastructure considerations. Sabrina was thanked for her work on organizing and structuring the paper. The team discussed the scope of the paper, particularly regarding training and inference workloads, and agreed to clarify the distinctions between different types of scheduling. They also considered how to present the various scheduling solutions in a way that would be useful for decision-makers. The conversation ended with plans for future discussions on topics such as DRA and workload management. + +## ➡️ Next Steps + +- Marlow: Email Sabrina to break up the white paper into 4-5 separate papers (each 10 pages or less), following the new length requirements. 
+- Sabrina: Split the current white paper into 4-5 individual papers (each 10 pages max), using different tabs in the same document link for easier review. +- Abhishek: Add a line or two to the white paper clarifying the distinction between scheduling for training/inference workloads and training/inference techniques themselves, and why certain items are out of scope. +- Marlow: Add a note to the white paper about recent changes in Kubernetes (e.g., workload API) and point to relevant documentation, to show awareness of recent developments. +- Alex: Modify the flowchart/table in the white paper to generalize references to traditional HPC schedulers (e.g., update "Slurm" to "traditional HPC scheduler") for neutrality. +- Marlow: Ensure all lists in the white paper are alphabetical before finalizing. +- Filip and James: Continue gathering documentation and resources for benchmarking, and keep the group posted on progress. +- Marlow: Follow up with the Green Reviews group to schedule a meeting and coordinate with Filip. +- Abhishek: Present/discuss the topic of how LLMD fits into the scheduling landscape at the next meeting. +- Marlow/Alex: Invite John Belamaric to the next meeting to discuss DRA topics. +- Marlow/Alex: Invite Tim to a future meeting to discuss topology awareness after the DRA session. + +## 📋 Summary + +### White Paper Restructuring Discussion + +The team discussed their white paper, which needs to be split into four shorter papers due to length restrictions of 10 pages. Marlow explained they have been working on scheduling topics and cataloged tools, while Sabrina has been helping maintain the structure. The group reviewed their scope, which focuses on scheduling, and identified potential future topics including multi-cluster scheduling, model architecture, and security. They noted that while the current paper is 35 pages long, they will need to focus on key areas and exclude references to meet the new length requirements. 
+ +### LLMD Scheduling Discussion Overview + +Alex and Marlow discussed the cost of resources and the need to include memory in their considerations. They also talked about a presentation given by Carlos from IBM about LLMD and its scheduler, which Abhishek clarified is a request scheduler for inference requests. Alex and Abhishek explored the differences between LLMD and other schedulers, with Abhishek describing it as being in between web request APIs and batch setups. They agreed that scheduling for LLMD was out of scope for their current paper. + +### Cloud-Native AI Scheduling Paper + +The team discussed the scope of their cloud-native AI white paper, particularly regarding training and inference techniques. They clarified that while training and inference methods are out of scope, the paper focuses on scheduling for request jobs, which is distinct from the actual training or inference techniques themselves. Abhishek explained the difference between infrastructure setup for pods and request scheduling, emphasizing that the paper addresses scheduling decisions for request jobs rather than the underlying training or inference processes. + +### Infrastructure Scheduling Paper Scope + +The team discussed the scope and content of a paper on infrastructure scheduling for training and inference workloads. Abhishek suggested adding a line to clarify that the paper focuses on bootstrapping infrastructure without addressing inference requests. Alex agreed this was necessary and mentioned the need to clarify the paper's scope, particularly regarding training techniques. The group decided to add a brief paragraph explaining the scope and limitations of the paper, without delving into specific training methods like reinforcement learning. + +### Document Updates and AI Workloads + +Marlow and Alex discussed updates to a document, noting that it may need an update regarding the workload API, which is still new. 
Sabrina added a section on pre-reading suggestions, which Marlow appreciated. They reviewed content on AI workloads, including data preparation, model development, training, and inference, while intentionally keeping the section brief due to the vastness of the topic. Marlow mentioned that some comments left for others to resolve had not received responses, and Alex acknowledged this. + +### Resource Lifecycle and Paper Split + +The team discussed resource characteristics across different lifecycle stages, including training and real-time inference workloads. They noted that their current white paper was exceeding the 10-page limit for white papers, prompting a decision to split the content into multiple papers. Sabrina was tasked with revising the paper to fit the new constraints, with a focus on scheduling fundamentals for the next installment. + +### Content Splitting and Reference Updates + +The team discussed splitting their content into multiple papers, agreeing that four or five sections would be more appropriate than compressing it into one 10-page paper. They confirmed that lists should remain alphabetical to avoid appearing biased. Marlow noted that while they had removed references to specific projects, there was an outstanding issue regarding Slurm, with Alex suggesting it could be modified to reference traditional HPC schedulers instead. + +### Document Splitting and Review Planning + +The team discussed splitting a document into four papers, each containing 10 or fewer pages, with the introduction becoming the first paper. Sabrina will create different tabs within the same link for reviewing the documents, and the team aims to finalize the documents in a week or two for pre-KubeCon release. Marlow mentioned that Filip and James are working on benchmarking, and he needs to follow up with Green Reviews. Filip noted that he is gathering documentation with James and Marlow, and they will keep the group updated. 
+ +### Infrastructure Scheduling for Inference Workloads + +The team discussed the scope of their paper regarding infrastructure and platform scheduling, particularly focusing on inference workloads. Alex noted that while inference techniques were initially marked as out of scope, they should remain in the paper as they are discussed later. The group reviewed how inference workloads, including LLMD, are addressed in the context of scheduling infrastructure, and examined a table mapping different scheduling challenges to solutions. They agreed to have Arby provide further insights on any differences between LLMD and other scheduling approaches. + +### Scheduler Categorization Strategy Discussion + +The team discussed the complexity of scheduler categorization and how to present it to outsiders. Alex suggested creating higher-level categories to group similar schedulers together, making it easier for decision-makers to compare options. Tommy and Marlow agreed that the current table provides a good overview of the ecosystem's complexity, but could be enhanced by adding labels or categories to guide users towards specific solutions. The group also clarified that inference workloads are indeed discussed in the document, reversing a previous change. + +### Kubernetes Scheduler Analysis Planning + +The group discussed the challenges of categorizing different schedulers and agreed to draw a line on the Kubernetes landscape, acknowledging that further detailed analysis would be needed. They planned the agenda for upcoming meetings, including a discussion on workload management and topology awareness, with Abhishek set to present on LLMD's role in Kubernetes. The team expressed gratitude to James, Filip, and Sabrina for their contributions, and Alex mentioned that Sabrina's paper would be reviewed and potentially expanded upon. 
diff --git a/tags/tag-workloads-foundation/subprojects/batch/meeting-notes/2026-03-10.md b/tags/tag-workloads-foundation/subprojects/batch/meeting-notes/2026-03-10.md new file mode 100644 index 000000000..829b5da79 --- /dev/null +++ b/tags/tag-workloads-foundation/subprojects/batch/meeting-notes/2026-03-10.md @@ -0,0 +1,66 @@ +# 📅 Mar 10, 2026: DRA Update + +📽️ [Recording](https://www.youtube.com/watch?v=579hf1YYlP8) +🤖 [AI Summary](https://zoom-lfx.platform.linuxfoundation.org/meeting/99965231171-1773154800000/summaries?password=6b99fcba-ab13-4061-9918-94193bc26160) + +## 👥 Attendees + +- Alex Scammon (G-Research) [host] +- Abhishek Malvankar (Red Hat) [host] +- Marlow Warnicke (NVIDIA) [host] +- John Belamaric + +## 📋 Agenda + +- 👋 Welcome & Introductions ("Hello, why are you here?") +- DRA Update + +## 📝 Quick Recap + +The meeting focused on discussing DRA (Dynamic Resource Allocation), where John Belamaric provided an overview of Kubernetes' enhanced capabilities for managing accelerators and specialized devices. The discussion covered how DRA enables more sophisticated scheduling through resource slices and claims, including support for MIG (Multi-Instance GPU) partitioning and the ability to express preferences and alternatives for resource allocation. Participants explored how external schedulers can interact with DRA's APIs, including the use of Kubernetes' watch mechanism for tracking resource claims, and discussed potential scaling challenges with complex preference matrices. The conversation also touched on how DRA's approach to resource management differs from traditional Kubernetes scheduling, with John explaining that resource slices can be modified without disrupting existing allocations. + +## ➡️ Next Steps + +- Marlow: Re-ping the TOC member who promised to review the white paper and follow up to ensure review is completed. 
+- Alex Scammon: Send details about the Armada kiosk and open house at KubeCon (including Slack invite/connection info) to Alex Kimber. +- Alex Kimber: Check with his colleague about attending KubeCon and send him Alex Scammon's way if attending, and follow up regarding the Kubernetes plugin development for the open resource broker. +- Alex Scammon: Put information about the OCP group meeting (including John's April 17th presentation) in the Slack channel for others to join if interested. +- Team: Schedule and hold the next meeting to discuss the Workloads API with Kevin Hannon in April (after KubeCon). + +## 📋 Summary + +### Work Updates and Event Planning + +The meeting began with introductions and casual conversation among participants, including updates about previous presentations and ongoing work. Marlow mentioned that Sabrina had split a white paper into five different papers, though they were still waiting for a review from a team member. The group briefly discussed upcoming events, including GTC and KubeCon, with some participants noting scheduling conflicts. The conversation then shifted toward updating on work streams before diving into DRA-related topics with John, though the specific DRA discussion was not captured in this transcript segment. + +### White Paper and Benchmarking Updates + +The team discussed updates on a white paper project that was previously started by another working group but never completed. Sabrina successfully split the original 40-page document into five separate pieces, each under 10 pages, and added necessary sections including introductions and executive summaries. Filip provided an update on benchmarking work being led by James, which is currently in phase one and focuses on creating specific benchmarks and components for testing in cluster setups, with plans to eventually benchmark different schedulers and AI workloads similar to Linpack. 
The team also briefly mentioned the transition to a new platform and upcoming KubeCon event where Armada will have a presence. + +### KubeCon Participation and DRA Transition + +The team discussed upcoming KubeCon attendance, with Alan declining due to scheduling conflicts and John confirming his participation where he will present talks on Working Group Device Management and CoreDNS. John then provided background on his involvement with DRA, explaining that he joined about two years ago to help transition the design from an opaque controller system to one that would better support auto-scaling and allow external schedulers to make better resource allocation decisions. + +### Dynamic Resource Allocation Overview + +John provided an overview of Dynamic Resource Allocation (DRA), explaining how it evolved from initial issues with the autoscaler to its current GA status in Kubernetes 1.34. He described DRA's four main components: resource slices for publishing device information, the resource claim API for more flexible device requests, scheduler code to match these components, and node-side implementation. The discussion highlighted ongoing challenges with partitioning and current state monitoring, with Alex Scammon raising concerns about the complexity of evicting workloads and the need for frequent API polling by schedulers. + +### Device Request and Management System + +John presented an overview of a system for requesting and managing multiple devices across different vendors, including features like standardized metadata, device classes, and resource sharing. He explained how users can request specific devices using device classes and additional selection criteria, and demonstrated examples of GPU configurations and sharing mechanisms. John also discussed the possibility of managing multi-node devices and mentioned ongoing work on pod groups and gang scheduling in Kubernetes, which could enable automatic lifecycle management of resources like TPU slices. 
+ +### MIG Support in Kubernetes + +John explained that the resource slice can support MIG (Multi-Instance GPU) by containing all possible GPU configurations and allowing the scheduler to dynamically allocate and reconfigure partitions without disrupting workloads. He noted that MIG support will be in alpha in Kubernetes 1.35 and beta in 1.36, with the GPU driver feature currently in alpha. John clarified that while dynamic MIG partitioning is possible, fragmentation management is not currently part of the upstream scheduler and would require additional implementation. + +### Kubernetes Resource Tracking Overview + +The team discussed tracking available resources in Kubernetes, focusing on how resource allocations are managed and monitored. John explained that resource slices represent capacity and don't change with allocations, while allocations are tracked through the scheduler's internal cache and watched via Kubernetes API's watch functionality. The discussion included details about a feature in development (PR 136) that would allow administrators to view resource pool capacity calculations, though this would be limited to namespace-confining users. + +### Etcd Resource Allocation Challenges + +John and Alex Scammon discussed challenges with etcd's resource allocation system, particularly around atomicity and race conditions when external schedulers are involved. John explained that while Kubernetes doesn't have transactional semantics like an RDBMS, they are exploring ways to allow external schedulers to make specific resource allocation decisions, either by reprogramming the request specifications or writing allocations directly. Alan shared that his company Yellow Dog, which operates independently on Kubernetes, faces similar challenges with managing multiple resource parameters and is developing a resource-based worker-less model for HPC workloads. 
+ +### HPC Resource Scheduling Preferences + +The group discussed resource scheduling and preferences in HPC environments. John explained that their system supports prioritized alternatives in resource claims, allowing users to specify preferences for different hardware configurations with up to 8 alternatives. The discussion touched on scalability concerns and potential future needs for more complex preference management as GPU options expand. The conversation also covered the relationship between resource slices and allocations, with John clarifying that changes to resource slices don't disrupt existing allocations. diff --git a/tags/tag-workloads-foundation/subprojects/batch/meeting-notes/2026-04-07.md b/tags/tag-workloads-foundation/subprojects/batch/meeting-notes/2026-04-07.md new file mode 100644 index 000000000..c51ae3aa6 --- /dev/null +++ b/tags/tag-workloads-foundation/subprojects/batch/meeting-notes/2026-04-07.md @@ -0,0 +1,54 @@ +# 📅 Apr 7, 2026: Workload-aware Scheduling + +📽️ [Recording](https://www.youtube.com/watch?v=7RPqSHz5U5U) +🤖 [AI Summary](https://zoom-lfx.platform.linuxfoundation.org/meeting/99965231171-1775574000000/summaries?password=c4975c8f-d9f4-4655-84c9-3d710d891ab8) + +## 👥 Attendees + +- Alex Scammon (G-Research) [host] +- Abhishek Malvankar (Red Hat) [host] +- Marlow Warnicke (NVIDIA) [host] + +## 📋 Agenda + +- 👋 Welcome & Introductions ("Hello, why are you here?") +- Workload-aware Scheduling (Workloads API) + +## 📝 Quick Recap + +The meeting focused on updates and discussions around workload-aware scheduling in Kubernetes, particularly highlighting new APIs introduced in versions 1.35 and 1.36. Kevin presented the workload and pod group APIs, demonstrating how they enable gang scheduling and address complex workload requirements like leader-worker configurations and resource sharing. 
The group discussed implications for external schedulers like Slurm and potential interactions with existing tools like Kueue, though some naming conflicts were noted between different workload APIs. James provided an update on their team's work on a white paper, which is awaiting CNCF infrastructure support, while Marlow shared that Slurm Bridge already implements workload-aware scheduling through job translation. The conversation ended with a brief discussion about hardware failure handling in AI workloads and potential future applications of these new scheduling features. + +## ➡️ Next Steps + +- Marlow: Follow up on approval for the white paper and determine where to publish it, potentially using the new GitHub repo. +- James: Continue work on the white paper, including documentation and infrastructure planning, and await CNCF support for infrastructure needs. +- Marlow: Continue seeking information on where the clusters live and identify the right person to contact for cluster resources. +- Marlow: Coordinate with Nikki (from Green Review) to discuss benchmarking with Filip and James. +- Alex: Consider inviting Boris to the next meeting (around April 21st) to discuss OpenGRIS and potentially request AWS resources for James' team. +- James: Await CNCF support/help for infrastructure to proceed with benchmarking and related events. +- Alex: Continue efforts to automate or improve documentation/notes updating process in the new GitHub repo for better meeting notes management. +- Marlow: Wait for the merge of the draft translation of job types for Slurm Bridge, then complete any necessary updates after files change. +- Alex: Proceed with changing Batch WG/Slack channel naming to Batch Subproject for clarity across CNCF and Kubernetes communities. +- Tommy: Review the meeting recording for context on the new APIs and workload-aware scheduling discussion. (While not strictly a group action, Tommy explicitly committed to this follow-up.) 
+ +## 📋 Summary + +### Benchmarking and White Paper Updates + +The team discussed updates on benchmarking and white paper initiatives. James mentioned they are working on benchmarking but are still waiting for CNCF introduction and approval for the white paper. Marlow reported that their paper is complete but awaiting approval to publish, with uncertainty about where to publish it. Alex provided updates on housekeeping efforts, including moving documentation to a new GitHub repository under the tags Workloads foundation section, where they can now store white papers and other materials. + +### Automation and Benchmarking Updates + +The team discussed updates on automation efforts and benchmarking activities. Filip announced that James had been accepted to CERN and that benchmarking work would accelerate over the summer. Marlow reported on KubeCon activities, including good attendance at the Slurm Bridge Batch talk and the Workloads Foundation talk, and mentioned that Nikki from the Green Review would be working with Filip and James on benchmarking. The conversation ended with Kevin preparing to present slides about workloads API, which he had previously discussed at the Working Group Batch. + +### Kubernetes Gang Scheduling Developments + +Kevin discussed the challenges and developments in gang scheduling within the Kubernetes community, particularly focusing on the complexities introduced in version 1.35 and the new features in 1.36. He highlighted three main workload types—jobs, job sets, and disaggregated serving—and explained how gang scheduling semantics vary across these use cases. Kevin introduced the concepts of workload-aware scheduling, which involves two new APIs: Workload and Pod Group. He emphasized the importance of moving beyond pod-level scheduling to group-level scheduling and discussed the vision of automating lifecycle management through controllers rather than leaving it to users. 
+ +### Workload and Pod Group Updates + +Kevin discussed the relationship between workloads, pod groups, and jobs in version 1.37, explaining that a workload can contain multiple pod groups and that the pod group template informs how pod groups are created. He highlighted the importance of making min count mutable for elasticity and introduced workload-aware disruption, which allows for toggling pod group preemption. Kevin also explained the new feature of pod group resource claims, enabling multiple pods to share the same resource claim, and outlined the goal of workload-aware controllers to handle orchestration tasks. He mentioned upcoming work on topology-aware scheduling and gang jobs, and expressed hope for finding someone to take on the job set work to prove out higher-level controller functionality. + +### Kubernetes Workload-Aware Scheduling Features + +Kevin demonstrated new gang scheduling and workload-aware features in Kubernetes 1.36, including APIs for pod groups and workloads. He explained how these features avoid conflicts with existing Kueue workload APIs by using different naming conventions and full domain names. The discussion covered integration challenges with auto-scaling and cluster autoscaler, with ongoing work to create a scheduling library to streamline these connections. Marlow confirmed that Slurm Bridge already implements workload-aware scheduling through translation to Slurm jobs, while James mentioned their team is working on documentation and planning to get involved with infrastructure setup soon. 
diff --git a/tags/tag-workloads-foundation/subprojects/batch/meeting-notes/2026-04-21.md b/tags/tag-workloads-foundation/subprojects/batch/meeting-notes/2026-04-21.md new file mode 100644 index 000000000..57131ef32 --- /dev/null +++ b/tags/tag-workloads-foundation/subprojects/batch/meeting-notes/2026-04-21.md @@ -0,0 +1,73 @@ +# 📅 Apr 21, 2026: OpenGRIS + +📽️ [Recording](https://www.youtube.com/watch?v=7H1dhpM0EaI) +🤖 [AI Summary](https://zoom-lfx.platform.linuxfoundation.org/meeting/99965231171-1776783600000/summaries?password=c0e38615-a312-452a-b33e-37dd6491f5ef) + +## 👥 Attendees + +- Alex Scammon (G-Research) [host] +- Abhishek Malvankar (Red Hat) [host] +- Marlow Warnicke (NVIDIA) [host] +- Boris Litvin (AWS) [presenter] +- Francisco Garau (Finos member) +- Vivian Hafener (NVIDIA) +- Mesut Oezdil (Open Source Contributor) + +## 📋 Agenda + +- 👋 Welcome & Introductions ("Hello, why are you here?") +- Benchmarking! +- Whitepapers! +- LF automation successes and failures 😀 +- OpenGRIS (Boris Litvin, AWS) + +## 📝 Quick Recap + +The meeting focused on discussing OpenGRIS, a project developed by Citibank to simplify cloud parallelization for quantitative analysts and engineers. Ritesh and Boris presented the tool, which allows developers to decorate functions for automatic parallelization and distribution across different cloud providers and schedulers without requiring explicit job construction. The discussion covered key features including multi-cloud support, hybrid on-premise and cloud execution, and integration with existing schedulers like AWS Batch and IBM Symphony. Participants raised questions about function limitations, performance estimation, monitoring capabilities, and resource management features, with Ritesh explaining that while some advanced features like prioritization and preemption are not currently implemented, they are on the roadmap. 
The conversation ended with plans to cover updates on DRA and Slurm topology awareness in future sessions, along with progress on benchmarking initiatives led by Marlow. + +## ➡️ Next Steps + +- Alex: Schedule follow-up meeting with John Belamaric to discuss latest updates in DRA +- Alex: Schedule follow-up meeting with Tim from Slurm/SchedMD/NVIDIA to discuss topology awareness for Slurm + +## 📋 Summary + +### Team Introductions and Updates + +The meeting began with introductions and casual conversation among participants including Alex, Marlow, Boris, Vivian, Filip, and Francisco. Francisco, who was new to this particular meeting, shared his background working at JP Morgan where he worked on grid systems, and mentioned he is currently unemployed and learning about AI tools. Alex noted that his message about the FINOS project had reached the FINOS community, resulting in a few additional participants joining the meeting. + +### FINOS Foundation Overview and Projects + +Alex explained that FINOS is a financial open source foundation under the Linux Foundation, run by Gabriele Columbro. He noted that while banks and sell-side firms dominate FINOS, the organization has been slow to embrace new technology due to focus on governance and security requirements. Francisco shared his experience at JP Morgan where FINOS provided legal framework for open source contributions, and Alex mentioned their team's work on Git Proxy, a project they are leading within FINOS. + +### OpenGRIS Cloud Parallelization Initiative + +Ritesh and Boris presented OpenGRIS, an initiative aimed at simplifying parallelization of applications to the cloud. Ritesh explained that OpenGRIS provides high-level interfaces that allow developers to automatically parallelize functions and programs without extensive manual configuration, supporting multiple cloud providers and on-premise environments.
Boris highlighted the project's focus on improving developer experience and reducing the impact of switching between different schedulers, mentioning his contribution of AWS Batch support. The presentation concluded with a brief overview of OpenGRIS's capabilities and its potential to address multi-cloud and multi-platform orchestration needs. + +### Function Distribution Approach Discussion + +Ritesh and Boris explained their approach to function distribution, emphasizing the goal of maintaining a simple programming paradigm by allowing developers to decorate functions for parallel execution rather than requiring them to restructure their code as tasks. They discussed how the system estimates execution time using heuristics, acknowledging that perfect prediction is theoretically impossible due to the halting problem. Ritesh clarified that while they aim to do minimal scheduling, they can work with various backends including Symphony and cloud providers, adapting their approach based on whether the underlying system handles orchestration and scheduling. + +### Evolution of Intelligent Schedulers + +The discussion focused on the evolution of schedulers from simple dispatch systems to more intelligent, data-aware systems. Ritesh and Alex discussed how schedulers are increasingly incorporating features like topology awareness, data locality, and cost optimization, with examples from their work on Armada and Kueue systems. They explored the concept of demand shifting, where jobs could be held and rescheduled to optimize resource usage, particularly for workloads that can be delayed. Ritesh emphasized that while schedulers will become more intelligent, the approach should be to defer to backend systems where possible rather than trying to manage or orchestrate everything themselves. 
+ +### Dispatch and Resource Management Concepts + +Abhishek discussed the concept of dispatch with Ritesh, clarifying that dispatch involves the scheduler bringing up appropriate resources, running workers, executing functions, and returning results before shutting down. Ritesh explained that they use AWS's Orb framework, which allows for integration with different cloud providers, and mentioned that Scaler provides a Ray-compatible interface. Ritesh emphasized the benefits of using such tools for automatic resource management, stating that it reduces developer friction and handles complex orchestration tasks. + +### OpenGRIS Job Orchestration Tool Discussion + +The discussion centered around OpenGRIS, a tool for orchestrating jobs across different regions and clouds, including hybrid environments. Ritesh explained that the target audience includes quantitative modelers, analysts, and engineers who need to run parallel workloads in the cloud without managing individual cloud instances. Jonathan questioned whether existing tools like Ray and Dask could address similar needs, while Francisco raised concerns about the limitations of parallelizing functions that require specific parameters or environment dependencies. Ritesh clarified that while OpenGRIS supports pure functions, it can also handle functions that read from local storage or S3, though state outside the function closure cannot be shared. + +### Task Distribution and Scheduling Challenges + +The team discussed challenges with task distribution and scheduling, particularly around efficiency and resource management in both on-premise and cloud environments. Ritesh explained their approach of delegating as much scheduling as possible to existing tools like Batch and Symphony, while implementing minimal custom solutions only where necessary.
The discussion addressed questions about monitoring and task management, with Zhuo confirming that their system tracks task completion, memory usage, and duration, and can propagate logs and error messages to the client side when tasks fail. + +### Cloud Parallelization Challenges Discussion + +Ritesh explained that while current vectorization happens automatically through compilers, cloud parallelization remains expensive and difficult for most engineers to implement. He described the motivation behind the Pargraph and Parfun libraries as tools to make cloud parallelization more seamless and implicit, particularly addressing the skill gap in enterprises where many quant professionals lack knowledge of Kubernetes and parallel computing. Francisco acknowledged these challenges, sharing his experience working at JP Morgan where they built an internal cloud system for risk calculation, noting familiarity with the allocation and compute management issues Ritesh described. + +### Open-Source Projects Status Update + +The meeting focused on updates and discussions about open-source projects and initiatives. Ritesh provided information about their open-source work, including documentation and Quick Start guides available on GitHub. Alex mentioned plans to have John Belamaric discuss DRA updates and Tim from Slurm/NVIDIA talk about topology awareness in future meetings. Marlow's recruitment efforts for benchmarking work were highlighted, with James and Filip's initiatives gaining momentum. The conversation ended with plans to continue these discussions in the next bi-weekly meeting.
diff --git a/tags/tag-workloads-foundation/subprojects/batch/meeting-notes/2026-05-05.md b/tags/tag-workloads-foundation/subprojects/batch/meeting-notes/2026-05-05.md new file mode 100644 index 000000000..a15706eda --- /dev/null +++ b/tags/tag-workloads-foundation/subprojects/batch/meeting-notes/2026-05-05.md @@ -0,0 +1,70 @@ +# 📅 May 5, 2026: DRA Round Deux + +📽️ [Recording](https://www.youtube.com/watch?v=G8vmRo9c1RQ) +🤖 [AI Summary](https://zoom-lfx.platform.linuxfoundation.org/meeting/99965231171-1777993200000/summaries?password=31828985-ca77-44a0-8af4-c112b0b153be) + +## 👥 Attendees + +- Alex Scammon (G-Research) [host] +- Abhishek Malvankar (Red Hat) [host] +- Marlow Warnicke (NVIDIA) [host] +- John Belamaric (Google) +- Mesut Oezdil (Open Source Contributor) +- Vivian Hafener (NVIDIA) + +## 📋 Agenda + +- Benchmarking +- Whitepapers +- DRA Part 2 w/ John Belamaric (Google) +- external scheduler integration, gang scheduling / pod groups in 1.36, atomic allocation challenges, and open questions from last session + +## 📝 Quick Recap + +The meeting focused on discussing Dynamic Resource Allocation (DRA) in Kubernetes, with John presenting an overview of recent enhancements to device modeling and scheduling APIs. John explained how DRA has expanded beyond simple string/int extended resources to include more descriptive device modeling, allowing for partial capacity consumption, partitionable devices like NVIDIA MIG, and cross-device constraints. The discussion covered technical details including sharing mechanisms for resources, scheduling constraints, and integration with existing systems like NRI and container runtimes. Participants raised questions about latency impacts, autoscaling integration, and the potential for using DRA resources for workloads beyond traditional devices. The group also discussed ongoing benchmarking efforts and white paper publications, with Marlow providing updates on a new benchmarking project involving multiple organizations. 
+ +## ➡️ Next Steps + +- James: Share the Trello board link for the benchmarking project with the group. +- Marlow: Double-check with Tim about his interest and availability to present topology/Slurm topics in a future meeting. +- John: Look for and share the link or information about the DRA scheduler library (staging repo) for Alex and others interested in integrating DRA logic into their schedulers. +- John: Share the prototype DRA driver for model caching as a resource once it is publicly accessible (expected within the next few days). +- John: Forward meeting invite for the DRA working group to Flamur (or provide logistics for joining future meetings). + +## 📋 Summary + +### OCP Hardware Software Co-Design Updates + +The group had an informal conversation covering various topics. Alex and Vivian discussed their current work, with Vivian mentioning her work on "linky stuff" and philosophical questions around AI usage. Alex shared observations about how architectural decisions in projects are often influenced by non-technical factors like politics and money. Victor announced the creation of a new Slack workspace for OCP Hardware Software Co-Design, which participants can join to follow ongoing activities and potentially participate when time permits. + +### Benchmarking and Collaboration Projects + +The meeting began with introductions, including Mesut, a new participant who introduced himself as a DevOps engineer and open source contributor. The group discussed two main ongoing projects: white papers that are being published and need final PR submissions, and a benchmarking project led by Marlow that aims to create a hardware-agnostic workload testing suite similar to LINPACK or HPCG. Flamur from AWS suggested exploring potential collaboration between their work and the benchmarking project, particularly regarding Stack's HPC workload benchmarking approach.
The meeting was cut short due to time constraints, with plans to continue discussions about DRA (Dynamic Resource Allocation) in a future session. + +### Kubernetes Device Management Evolution + +John explained the evolution of device management in Kubernetes, highlighting the expansion of APIs to provide more descriptive and expressive modeling of devices, including support for partial capacity allocation and cross-device constraints. He discussed the integration of DRA with NRI for node-level preparation and actuation, emphasizing the benefits of combining these approaches. Flamur asked about slicing non-partitionable resources, to which John responded that current APIs do not support this directly, but there are efforts to improve scheduling and admission control to handle sharing more effectively. + +### NVIDIA MIG Kubernetes Implementation + +John explained how NVIDIA MIG works with the Kubernetes model using the NVIDIA DRA driver's Dynamic MIG feature, which leverages partitionable devices through the resource slice modeling API. He described how this allows for flexible device allocation where partitions can be combined or split across memory slots, with the scheduler managing resource allocation atomically to prevent overcommitment. John noted that this approach enables smoother transitions between GPU generations and helps with resource scarcity by allowing more flexibility in resource claims. + +### Unified API for Custom Resources + +Flamur discussed the need for a unified API to manage custom resource definitions beyond standard request limits, including CPU time slots, memory splits, and advanced scheduling considerations like NUMA node placement and network interface constraints. John explained that their current DRA implementation already supports publishing CPUs as DRA devices and includes alpha features for managing constraints like NUMA nodes and PCIe bridges, with GPU and NIC support already implemented and CPU support in development.
John invited Flamur to participate in future meetings to provide requirements and ensure the implementation meets his needs. + +### Kubernetes Resource Constraints Implementation + +The team discussed the implementation of resource constraints and scheduling policies in Kubernetes, particularly focusing on how the Kubelet handles enforcement and dynamic resource allocation (DRA). John explained that while traditional topology management allows for "prefer" or "enforce" configurations, the DRA mechanism ensures pods only get scheduled to nodes where resource constraints can be met, with enforcement handled by the Kubelet and NRI plugin. The discussion also covered MIG (Multi-Instance GPU) capabilities, with John noting that while H100 GPUs and later support fully dynamic reconfiguration without requiring workload evacuation, A100 GPUs still require a reset when transitioning from non-MIG to MIG mode. + +### GPU Partitioning and Resource Management + +John and Alex discussed improvements to GPU partitioning and resource management. John explained that the new system allows for more dynamic configuration compared to previous static approaches used in GKE, where node pools were configured at creation time. They discussed the need to enhance work with DRA (Dynamic Resource Allocation) and the complexity involved in handling resource claims with optionality, including cross-device constraints and prioritized lists. John advised simplifying resource claims by making choices for users rather than passing on optionality to the scheduler to avoid complexity in quota management. + +### Scheduler Framework API Design Discussion + +The team discussed the scheduler framework and API design, with John explaining how they moved away from outsourcing scheduling logic to third-party controllers in earlier versions (1.26-1.27) to create a more expressive modeling API in 1.30-1.32. 
Marlow raised a question about splitting the system into individual and grouped modes, specifically about ensuring memory can still be extracted only from the grouped mode while allowing individual cores. Abhishek asked about latency and churn impacts for inference workloads, though the transcript ended before he received a full answer. + +### DRA Integration with Autoscalers + +John discussed the integration of DRA (Dynamic Resource Allocation) with autoscalers, noting that while there are potential edge cases and optimization opportunities, no significant issues have been observed so far. He also shared a prototype that allows publishing models as DRA resources, which could help optimize model loading and reduce startup times. The group discussed feedback mechanisms and logistics for accessing the working group information, with John mentioning that some links are still private but will be published soon. Flamur suggested that DRA resources could potentially be applied to other workloads beyond inference, and John agreed, highlighting the generic nature of the APIs and ongoing work in this area. diff --git a/tags/tag-workloads-foundation/subprojects/batch/meeting-notes/2026-05-19.md b/tags/tag-workloads-foundation/subprojects/batch/meeting-notes/2026-05-19.md new file mode 100644 index 000000000..8b3832361 --- /dev/null +++ b/tags/tag-workloads-foundation/subprojects/batch/meeting-notes/2026-05-19.md @@ -0,0 +1,63 @@ +# 📅 May 19, 2026: Topology-Awareness in Slurm Redux + +📽️ [Recording](https://www.youtube.com/watch?v=ZJHHmGAVBBM) +🤖 [AI Summary](https://zoom-lfx.platform.linuxfoundation.org/meeting/99965231171-1779202800000/summaries?password=4db89e74-9e2a-43f7-b955-2e97147fb64e) + +## 👥 Attendees + +- Alex Scammon (G-Research) [host] +- Abhishek Malvankar (Red Hat) [host] +- Marlow Warnicke (NVIDIA) [host] +- Pavan Madduri (W.W.
Grainger / CNCF Golden Kubestronaut) +- Tim Wickberg (SchedMD) +- Abhigyan Mohanta +- Diego Ciangottini (INFN) +- Filip Novovic (ISC) +- James Bhattarai +- Tommy Aldo Sonin (ISC) +- Victor Lu +- Vivian Hafener (SchedMD/NVIDIA) +- Jonathan Skone (LBNL/NERSC) +- Mesut Oezdil (Open Source Contributor) +- Nathan Rini (SchedMD/NVIDIA) + +## 📋 Agenda + +- 👋 Welcome & Introductions ("Hello, why are you here?") +- Topology-Aware Scheduling for Slurm (Tim Wickberg, SchedMD / NVIDIA) + +## 📝 Quick Recap + +This meeting was a follow-up discussion on batch scheduling and topology awareness, led by Alex with Tim presenting detailed technical insights on Slurm's backfill scheduling and topology block plugin. Tim explained how Slurm handles multi-node workloads with specific focus on NVIDIA's NVLink interconnect topology, demonstrating through diagrams how the scheduler plans resource allocation across different network blocks to optimize performance while maintaining system integrity. The discussion covered how backfill scheduling works at the node level, the challenges of managing multi-rack configurations, and the trade-offs between topology optimization and cluster utilization. Tim emphasized that workloads not requiring topology awareness should be run on different hardware to avoid fragmenting the specialized equipment needed for topology-optimized work. The conversation also touched on practical considerations like hardware reliability, job restarts, and the challenges of coordinating multiple schedulers in Kubernetes environments. + +## ➡️ Next Steps + +- Filip: Continue coordinating the benchmarking group (including NVIDIA, University in Nepal, and others) on assigned benchmarking tasks, update progress in the shared Git repository, and manage tasks in Trello. +- Alex: Submit or coordinate submission of a talk/panel proposal for KubeCon about the white papers, with panelists including Alex, Abhi, Marlow, Sabrina, and others who contributed to the paper.
+- All benchmarking group members: Continue contributing to the shared Git repository with benchmarking work and review each other's contributions as progress is made. + +## 📋 Summary + +### White Papers and KubeCon Planning + +The team met to discuss updates on white papers and upcoming events. Alex announced that a PR for five previously written white papers is currently in the process of being published. The group also plans to submit a paper or talk for KubeCon, featuring a panel discussion with Alex, Abhishek, Marlow, and others about the white papers and batch landscape. + +### Benchmarking Collaboration Update + +Filip provided an update on benchmarking efforts, explaining that a small group has formed on Slack with members from NVIDIA and a university in Nepal, working on assigned tasks in Trello and using Git for collaboration. Pavan was introduced as a new participant working on Volcano and Dragonfly projects. The conversation ended with plans to return to discussing topology, with Tim requested to continue the conversation. + +### Slurm Topology and Backfill Scheduling + +Tim presented on Slurm's topology awareness and backfill scheduling capabilities, focusing on how HPC batch schedulers handle multi-node workloads and network topology constraints compared to Kubernetes. The discussion centered on the differences between Slurm's approach to multi-node scheduling versus Kubernetes' pod-based model, particularly around resource accounting and topology optimization. Tim explained how backfill scheduling works by planning resource usage through time to fit lower priority workloads into available gaps without impacting longer-term cluster plans, using a simplified example of a 16-node system with specific job durations.
+ +### Slurm Time Discretization Implementation + +Tim explained his work on implementing time discretization in Slurm's backfill planning system, which rounds time slots to a specified granularity (typically 5 minutes in production) to reduce modeling complexity. He demonstrated how Slurm's topology block plugin affects NVLink interconnects and node allocation, showing through examples how job scheduling is constrained by node placement requirements across different switches. Tim illustrated how these constraints can lead to suboptimal resource utilization, particularly when jobs cannot be scheduled due to topology restrictions, and showed how reoptimizing the node layout could potentially free up resources for other pending jobs. + +### Topology-Aware Scheduling in Slurm + +Tim explained that topology-aware scheduling in Slurm attempts to optimize job placement across fewer blocks of nodes to reduce cluster fragmentation, unlike other implementations that might spread workloads more broadly. He emphasized that all workloads should go through the same topology optimization process, as allowing non-topology-aware work to run on specialized equipment can prioritize less desirable workloads and degrade service levels for those that truly need the network capabilities. + +### Slurm Topology-Aware Scheduling Features + +Tim explained Slurm's topology-aware scheduling features, particularly for GPU workloads with NVLink interconnects, where optimized layouts can provide up to 40% performance improvements. He described how the system manages multi-node workloads through topology blocks and planning mechanisms, while maintaining resiliency through planning block sizes that account for potential node failures. Tim clarified that while backfill scheduling creates temporary maps for resource allocation, these maps are discarded after each scheduling cycle rather than being persisted, as rebuilding is computationally more efficient than modifying existing plans. 
diff --git a/tags/tag-workloads-foundation/subprojects/batch/meeting-notes/2026-06-02.md b/tags/tag-workloads-foundation/subprojects/batch/meeting-notes/2026-06-02.md new file mode 100644 index 000000000..9f373ad12 --- /dev/null +++ b/tags/tag-workloads-foundation/subprojects/batch/meeting-notes/2026-06-02.md @@ -0,0 +1,13 @@ +# 📅 Jun 2, 2026 + +## 👥 Attendees + +- Alex Scammon (G-Research) [host] +- Abhishek Malvankar (Red Hat) [host] +- Marlow Warnicke (NVIDIA) [host] +- Mesut Oezdil (Open Source Contributor) + +## 📋 Agenda + +- 👋 Welcome & Introductions ("Hello, why are you here?") +- Kueue + Slurm? diff --git a/tags/tag-workloads-foundation/subprojects/batch/meeting-notes/README.md b/tags/tag-workloads-foundation/subprojects/batch/meeting-notes/README.md new file mode 100644 index 000000000..45b095676 --- /dev/null +++ b/tags/tag-workloads-foundation/subprojects/batch/meeting-notes/README.md @@ -0,0 +1,80 @@ +# Batch Subproject — Meeting Notes + +Meeting notes for the [CNCF Batch Subproject](https://tag-workloads-foundation.cncf.io/batch/). 
+ +## Resources + +- **Google Doc (full notes):** https://docs.google.com/document/d/1GuZGyBkRGG0lEeiPA8q0PfvFlwUlwa5k-ZfXafCTdBY/edit +- **Charter:** [charter.md](../charter.md) +- **YouTube playlist:** https://www.youtube.com/playlist?list=PLlo2EEMTvVU-jMMA208R-cSEcVkmPYjxZ +- **LFX meeting page:** https://lfx.linuxfoundation.org/tools/open-source-summit/project-management/meetings +- **Zoom:** https://zoom-lfx.platform.linuxfoundation.org/meeting/99965231171?password=2a169dd5-e375-4b5a-9b40-b2b5db5bfe91 + +## Meetings + +- [📅 Jun 2, 2026](./2026-06-02.md) +- [📅 May 19, 2026: Topology-Awareness in Slurm Redux](./2026-05-19.md) +- [📅 May 5, 2026: DRA Round Deux](./2026-05-05.md) +- [📅 Apr 21, 2026: OpenGRIS](./2026-04-21.md) +- [📅 Apr 7, 2026: Workload-aware Scheduling](./2026-04-07.md) +- [📅 Mar 10, 2026: DRA Update](./2026-03-10.md) +- [📅 Feb 24, 2026: AI Scheduling Whitepaper Review](./2026-02-24.md) +- [📅 Feb 10, 2026: DRANet and the MPI Operator](./2026-02-10.md) +- [📅 Jan 27, 2026: New Year’s Plans](./2026-01-27.md) +- [📅 Jan 13, 2026: Networking! 
(Postponed)](./2026-01-13.md) +- [📅 Dec 2, 2025: Slurm Topology Awareness](./2025-12-02.md) +- [📅 Nov 18, 2025](./2025-11-18.md) +- [📅 Oct 21, 2025](./2025-10-21.md) +- [📅 Oct 7, 2025](./2025-10-07.md) +- [📅 Sep 23, 2025](./2025-09-23.md) +- [📅 Sep 9, 2025](./2025-09-09.md) +- [📅 Aug 26, 2025](./2025-08-26.md) +- [📅 Aug 12, 2025](./2025-08-12.md) +- [📅 Jul 29, 2025](./2025-07-29.md) +- [📅 Jul 15, 2025](./2025-07-15.md) +- [📅 Jul 1, 2025](./2025-07-01.md) +- [📅 Jun 17, 2025](./2025-06-17.md) +- [📅 Jun 3, 2025](./2025-06-03.md) +- [📅 May 20, 2025](./2025-05-20.md) +- [📅 May 6, 2025](./2025-05-06.md) +- [📅 Apr 22, 2025](./2025-04-22.md) +- [📅 Apr 8, 2025](./2025-04-08.md) +- [📅 Mar 25, 2025](./2025-03-25.md) +- [📅 Mar 11, 2025](./2025-03-11.md) +- [📅 Feb 25, 2025](./2025-02-25.md) +- [📅 Feb 11, 2025](./2025-02-11.md) +- [📅 Jan 27, 2025](./2025-01-27.md) +- [📅 Jan 13, 2025](./2025-01-13.md) +- [📅 Dec 16, 2024](./2024-12-16.md) +- [📅 Dec 2, 2024](./2024-12-02.md) +- [📅 Nov 18, 2024](./2024-11-18.md) +- [📅 Jul 1, 2024](./2024-07-01.md) +- [📅 Oct 9, 2023](./2023-10-09.md) +- [📅 Sep 11, 2023](./2023-09-11.md) +- [📅 May 22, 2023](./2023-05-22.md) +- [📅 May 8, 2023](./2023-05-08.md) +- [📅 Apr 24, 2023](./2023-04-24.md) +- [📅 Apr 10, 2023](./2023-04-10.md) +- [📅 Mar 27, 2023](./2023-03-27.md) +- [📅 Mar 13, 2023](./2023-03-13.md) +- [📅 Feb 27, 2023](./2023-02-27.md) +- [📅 Feb 13, 2023](./2023-02-13.md) +- [📅 Jan 30, 2023](./2023-01-30.md) +- [📅 Dec 19, 2022](./2022-12-19.md) +- [📅 Nov 21, 2022](./2022-11-21.md) +- [📅 Nov 7, 2022](./2022-11-07.md) +- [📅 Oct 10, 2022](./2022-10-10.md) +- [📅 Sep 26, 2022](./2022-09-26.md) +- [📅 Aug 15, 2022](./2022-08-15.md) +- [📅 Aug 1, 2022](./2022-08-01.md) +- [📅 Jul 18, 2022](./2022-07-18.md) +- [📅 Jun 20, 2022](./2022-06-20.md) +- [📅 Jun 6, 2022](./2022-06-06.md) + +## Cadence + +Meetings are held **every other Tuesday at 8:00 AM PDT** (biweekly). 
+ +## File naming + +Each file in this directory is named `YYYY-MM-DD.md` corresponding to the meeting date.