AI workloads differ significantly from conventional cloud workloads (e.g., big data OLAP or OLTP workloads). In early 2017, I began to look into this problem and tried to understand its implications. With my colleagues, we investigated a massive amount of AI workloads in Philly, Microsoft’s early GPU cluster management system designed for deep learning training. We shared our findings in (Jeon et al., 2019). We explained our thoughts on the scheduling primitives for training jobs (Xiao et al., 2018), and emphasized the importance of topology-aware scheduling (Zhao et al., 2020) in the AI era. Meanwhile, we also discovered several interesting opportunities in the coexistence of gaming and training workloads (Zhang et al., 2022), the co-design of caching and scheduling (Zhao et al., 2023), and elastic training (Gu et al., 2023).
Given the strategic importance of GPU cluster management, I led an engineering group to develop OpenPAI, a Kubernetes-based open-source cluster management platform for deep learning training and inferencing. OpenPAI is one of the earliest Kubernetes systems capable of managing GPU clusters. It integrated several of the techniques mentioned above. Its key components, such as the framework controller, have been adopted by Azure AI products. As far as I know, several external organizations have also developed their training infrastructure based on OpenPAI.
@inproceedings{DBLP:conf/eurosys/ZhaoHYZ0YZL0QZZ23,
  author    = {Zhao, Hanyu and Han, Zhenhua and Yang, Zhi and Zhang, Quanlu and Li, Mingxia and Yang, Fan and Zhang, Qianxi and Li, Binyang and Yang, Yuqing and Qiu, Lili and Zhang, Lintao and Zhou, Lidong},
  title     = {SiloD: {A} Co-design of Caching and Scheduling for Deep Learning Clusters},
  booktitle = {Proceedings of the Eighteenth European Conference on Computer Systems, {EuroSys}},
  year      = {2023},
}
ElasticFlow: An Elastic Serverless Training Platform for Distributed Deep Learning
Diandian Gu and 9 more authors
In Proceedings of the 28th ACM International Conference on Architectural Support for Programming Languages and Operating Systems, Volume 2, ASPLOS, 2023
@inproceedings{DBLP:conf/asplos/GuZZXHCYHJL23,
  author    = {Gu, Diandian and Zhao, Yihao and Zhong, Yinmin and Xiong, Yifan and Han, Zhenhua and Cheng, Peng and Yang, Fan and Huang, Gang and Jin, Xin and Liu, Xuanzhe},
  title     = {ElasticFlow: An Elastic Serverless Training Platform for Distributed Deep Learning},
  booktitle = {Proceedings of the 28th {ACM} International Conference on Architectural Support for Programming Languages and Operating Systems, Volume 2, {ASPLOS}},
  year      = {2023},
}
@inproceedings{DBLP:conf/usenix/0149CH000SYG22,
  author    = {Zhang, Wei and Chen, Binghao and Han, Zhenhua and Chen, Quan and Cheng, Peng and Yang, Fan and Shu, Ran and Yang, Yuqing and Guo, Minyi},
  title     = {PilotFish: Harvesting Free Cycles of Cloud Gaming with Deep Learning Training},
  booktitle = {2022 {USENIX} Annual Technical Conference, {USENIX} {ATC}},
  year      = {2022},
}
@inproceedings{DBLP:conf/osdi/ZhaoHYZYZYLWXW20,
  author    = {Zhao, Hanyu and Han, Zhenhua and Yang, Zhi and Zhang, Quanlu and Yang, Fan and Zhou, Lidong and Yang, Mao and Lau, Francis C. M. and Wang, Yuqi and Xiong, Yifan and Wang, Bin},
  title     = {{HiveD}: Sharing a {GPU} Cluster for Deep Learning with Guarantees},
  booktitle = {14th {USENIX} Symposium on Operating Systems Design and Implementation, {OSDI}},
  year      = {2020},
}
@inproceedings{DBLP:conf/usenix/JeonVPQXY19,
  author    = {Jeon, Myeongjae and Venkataraman, Shivaram and Phanishayee, Amar and Qian, Junjie and Xiao, Wencong and Yang, Fan},
  title     = {Analysis of Large-Scale Multi-Tenant {GPU} Clusters for {DNN} Training Workloads},
  booktitle = {2019 {USENIX} Annual Technical Conference, {USENIX} {ATC}},
  year      = {2019},
}
@inproceedings{DBLP:conf/osdi/XiaoBRSKHPPZZYZ18,
  author    = {Xiao, Wencong and Bhardwaj, Romil and Ramjee, Ramachandran and Sivathanu, Muthian and Kwatra, Nipun and Han, Zhenhua and Patel, Pratyush and Peng, Xuan and Zhao, Hanyu and Zhang, Quanlu and Yang, Fan and Zhou, Lidong},
  title     = {{Gandiva}: Introspective Cluster Scheduling for Deep Learning},
  booktitle = {13th {USENIX} Symposium on Operating Systems Design and Implementation, {OSDI}},
  year      = {2018},
}