publications | Fan Yang

Please find the complete list here.

2026

Sparse Attention Adaptation for Long Reasoning

Yizhao Gao, and 14 more authors

In International Conference on Learning Representations, ICLR, 2026

@inproceedings{SeerAttentionR26,
  author    = {Gao, Yizhao and Guo, Shuming and Cao, Shijie and Xia, Yuqing and Cheng, Yu and Wang, Lei and Ma, Lingxiao and Sun, Yutao and Ye, Tianzhu and Dong, Li and So, Hayden Kwok-Hay and Hua, Yu and Cao, Ting and Yang, Fan and Yang, Mao},
  title     = {Sparse Attention Adaptation for Long Reasoning},
  booktitle = {International Conference on Learning Representations, {ICLR}},
  year      = {2026},
}

ICLR

TileLang: Bridge Programmability and Performance in Modern Neural Kernels

Lei Wang, and 10 more authors

In International Conference on Learning Representations, ICLR, 2026

Oral Bib HTML Code

ICLR’26 oral paper.

@inproceedings{tilelang26,
  author    = {Wang, Lei and Cheng, Yu and Shi, Yining and Tang, Zhengju and Mo, Zhiwen and Xie, Wenhao and Ma, Lingxiao and Xia, Yuqing and Xue, Jilong and Yang, Fan and Yang, Zhi},
  title     = {{TileLang}: Bridge Programmability and Performance in Modern Neural Kernels},
  booktitle = {International Conference on Learning Representations, {ICLR}},
  year      = {2026},
}

ICLR

LoongRL: Reinforcement Learning for Advanced Reasoning over Long Contexts

Siyuan Wang, and 6 more authors

In International Conference on Learning Representations, ICLR, 2026

Oral Bib HTML Code

ICLR’26 oral paper.

@inproceedings{loongRL26,
  author    = {Wang, Siyuan and Zhang, Gaokai and Zhang, Li Lyna and Shang, Ning and Yang, Fan and Chen, Dongyao and Yang, Mao},
  title     = {{LoongRL}: Reinforcement Learning for Advanced Reasoning over Long Contexts},
  booktitle = {International Conference on Learning Representations, {ICLR}},
  year      = {2026},
}

PPoPP

MetaAttention: A Unified and Performant Attention Framework across Hardware Backends

Feiyang Chen, and 11 more authors

In Proceedings of the 31st ACM SIGPLAN Annual Symposium on Principles and Practice of Parallel Programming, 2026

Bib HTML

@inproceedings{10.1145/3774934.3786444,
  author    = {Chen, Feiyang and Cheng, Yu and Wang, Lei and Xia, Yuqing and Miao, Ziming and Ma, Lingxiao and Yang, Fan and Xue, Jilong and Yang, Zhi and Yang, Mao and Wei, Xingda and Chen, Haibo},
  title     = {{MetaAttention}: A Unified and Performant Attention Framework across Hardware Backends},
  booktitle = {Proceedings of the 31st {ACM} {SIGPLAN} Annual Symposium on Principles and Practice of Parallel Programming},
  year      = {2026},
  doi       = {10.1145/3774934.3786444},
}

2025

ArXiv

Vibe Reasoning: Eliciting Frontier AI Mathematical Capabilities – A Case Study on IMO 2025 Problem 6

Jiaao Wu, and 3 more authors

ArXiv, 2025

Bib HTML

@article{vibereasoning25,
  author  = {Wu, Jiaao and Zhang, Xian and Yang, Fan and Dong, Yinpeng},
  title   = {Vibe Reasoning: Eliciting Frontier {AI} Mathematical Capabilities -- A Case Study on {IMO} 2025 Problem 6},
  journal = {ArXiv},
  year    = {2025},
}

NeurIPS

Reviving DSP for Advanced Theorem Proving in the Era of Reasoning Models

Chenrui Cao, and 6 more authors

In Advances in Neural Information Processing Systems, NeurIPS, 2025

Bib HTML

@inproceedings{dspplus25,
  author    = {Cao, Chenrui and Song, Liangcheng and Li, Zenan and Le, Xinyi and Zhang, Xian and Xue, Hui and Yang, Fan},
  title     = {Reviving {DSP} for Advanced Theorem Proving in the Era of Reasoning Models},
  booktitle = {Advances in Neural Information Processing Systems, {NeurIPS}},
  year      = {2025},
}

NeurIPS

rStar-Coder: Scaling Competitive Code Reasoning with a Large-Scale Verified Dataset

Yifei Liu, and 7 more authors

In Advances in Neural Information Processing Systems, NeurIPS, 2025

Bib HTML

@inproceedings{rstarcoder25,
  author    = {Liu, Yifei and Zhang, Li Lyna and Zhu, Yi and Dong, Bingcheng and Zhou, Xudong and Shang, Ning and Yang, Fan and Yang, Mao},
  title     = {{rStar-Coder}: Scaling Competitive Code Reasoning with a Large-Scale Verified Dataset},
  booktitle = {Advances in Neural Information Processing Systems, {NeurIPS}},
  year      = {2025},
}

NeurIPS

RetrievalAttention: Accelerating Long-Context LLM Inference via Vector Retrieval

Di Liu, and 13 more authors

In Advances in Neural Information Processing Systems, NeurIPS, 2025

ENLSP’24 Best Paper Bib HTML

An early version won the Best Paper Award at the NeurIPS Efficient Natural Language and Speech Processing (ENLSP-IV) workshop 2024.

@inproceedings{retrievalattention2025,
  author    = {Liu, Di and Chen, Meng and Lu, Baotong and Jiang, Huiqiang and Han, Zhenhua and Zhang, Qianxi and Chen, Qi and Zhang, Chengruidong and Ding, Bailu and Zhang, Kai and Chen, Chen and Yang, Fan and Yang, Yuqing and Qiu, Lili},
  title     = {{RetrievalAttention}: Accelerating Long-Context {LLM} Inference via Vector Retrieval},
  booktitle = {Advances in Neural Information Processing Systems, {NeurIPS}},
  year      = {2025},
}

NeurIPS

SeerAttention: Learning Intrinsic Sparse Attention in Your LLMs

Yizhao Gao, and 7 more authors

In Advances in Neural Information Processing Systems, NeurIPS, 2025

Bib HTML

@inproceedings{seerattention25,
  author    = {Gao, Yizhao and Zeng, Zhichen and Du, Dayou and Cao, Shijie and So, Hayden Kwok-Hay and Cao, Ting and Yang, Fan and Yang, Mao},
  title     = {{SeerAttention}: Learning Intrinsic Sparse Attention in Your {LLMs}},
  booktitle = {Advances in Neural Information Processing Systems, {NeurIPS}},
  year      = {2025},
}

SOSP

TrainVerify: Equivalence-Based Verification for Distributed LLM Training

Yunchi Lu, and 6 more authors

In SOSP. ArXiv version, 2025

Bib HTML Code

@inproceedings{trainverify25,
  author    = {Lu, Yunchi and Miao, Youshan and Tan, Cheng and Huang, Peng and Zhu, Yi and Zhang, Xian and Yang, Fan},
  title     = {{TrainVerify}: Equivalence-Based Verification for Distributed {LLM} Training},
  booktitle = {{SOSP}},
  year      = {2025},
}

ArXiv

rStar2-Agent: Agentic Reasoning Technical Report

Ning Shang, and 14 more authors

ArXiv, 2025

Hugging Face Daily Papers Bib HTML

Selected as Hugging Face daily papers: #1 paper of the day (2025-08-29).

@article{rstar2agent25,
  author  = {Shang, Ning and Liu, Yifei and Zhu, Yi and Zhang, Li Lyna and Xu, Weijiang and Guan, Xinyu and Zhang, Buze and Dong, Bingcheng and Zhou, Xudong and Zhang, Bowen and Xin, Ying and Miao, Ziming and Li, Scarlett and Yang, Fan and Yang, Mao},
  title   = {{rStar2-Agent}: Agentic Reasoning Technical Report},
  journal = {ArXiv},
  year    = {2025},
}

ISCA

LUT Tensor Core: A Software-Hardware Co-Design for LUT-Based Low-Bit LLM Inference

Zhiwen Mo, and 10 more authors

In Proceedings of the 52nd Annual International Symposium on Computer Architecture (ISCA), 2025

Bib HTML Slides

@inproceedings{lutcore25,
  author    = {Mo, Zhiwen and Wang, Lei and Wei, Jianyu and Zeng, Zhichen and Cao, Shijie and Ma, Lingxiao and Jing, Naifeng and Cao, Ting and Xue, Jilong and Yang, Fan and Yang, Mao},
  title     = {{LUT} Tensor Core: A Software-Hardware Co-Design for {LUT}-Based Low-Bit {LLM} Inference},
  booktitle = {Proceedings of the 52nd Annual International Symposium on Computer Architecture ({ISCA})},
  year      = {2025},
}

OSDI

WaferLLM: A Wafer-Scale LLM Inference System

Congjie He, and 7 more authors

In 19th USENIX Symposium on Operating Systems Design and Implementation, OSDI. An introductory article at ;login:, 2025

Bib HTML Slides

@inproceedings{waferllm25,
  author    = {He, Congjie and Huang, Yeqi and Mu, Pei and Miao, Ziming and Xue, Jilong and Ma, Lingxiao and Yang, Fan and Mai, Luo},
  title     = {{WaferLLM}: A Wafer-Scale {LLM} Inference System},
  booktitle = {19th {USENIX} Symposium on Operating Systems Design and Implementation, {OSDI}},
  year      = {2025},
}

OSDI

PipeThreader: Software-Defined Pipelining for Efficient DNN Execution

Yu Cheng, and 11 more authors

In 19th USENIX Symposium on Operating Systems Design and Implementation, OSDI, 2025

Bib HTML Code Slides

@inproceedings{pipethreader25,
  author    = {Cheng, Yu and Wang, Lei and Shi, Yining and Xia, Yuqing and Ma, Lingxiao and Xue, Jilong and Wang, Yang and Mo, Zhiwen and Chen, Feiyang and Yang, Fan and Yang, Mao and Yang, Zhi},
  title     = {{PipeThreader}: Software-Defined Pipelining for Efficient {DNN} Execution},
  booktitle = {19th {USENIX} Symposium on Operating Systems Design and Implementation, {OSDI}},
  year      = {2025},
}

ArXiv

TileLang: A Composable Tiled Programming Model for AI Systems

Lei Wang, and 10 more authors

ArXiv. (A revised version appears in ICLR’26), 2025

Bib HTML Code

@article{tilelang25,
  author  = {Wang, Lei and Cheng, Yu and Shi, Yining and Tang, Zhengju and Mo, Zhiwen and Xie, Wenhao and Ma, Lingxiao and Xia, Yuqing and Xue, Jilong and Yang, Fan and Yang, Zhi},
  title   = {{TileLang}: A Composable Tiled Programming Model for {AI} Systems},
  journal = {ArXiv},
  year    = {2025},
}

ArXiv

RetroInfer: A Vector-Storage Approach for Scalable Long-Context LLM Inference

Yaoqi Chen, and 17 more authors

ArXiv, 2025

Bib HTML

@article{chen2025retroinfervectorstorageapproachscalable,
  author  = {Chen, Yaoqi and Zhang, Jinkai and Lu, Baotong and Zhang, Qianxi and Zhang, Chengruidong and Luo, Jingjia and Liu, Di and Jiang, Huiqiang and Chen, Qi and Liu, Jing and Ding, Bailu and Yan, Xiao and Jiang, Jiawei and Chen, Chen and Zhang, Mingxing and Yang, Yuqing and Yang, Fan and Yang, Mao},
  title   = {{RetroInfer}: A Vector-Storage Approach for Scalable Long-Context {LLM} Inference},
  journal = {ArXiv},
  year    = {2025},
}

ICML

rStar-Math: Small LLMs Can Master Math Reasoning with Self-Evolved Deep Thinking

Xinyu Guan, and 7 more authors

In International Conference on Machine Learning, ICML, 2025

Oral Bib HTML

ICML Oral paper (120 of 3260 accepted papers). Selected as Hugging Face daily papers: #1 paper of the day (2025-01-09).

@inproceedings{rstarmaths25,
  author    = {Guan, Xinyu and Zhang, Li Lyna and Liu, Yifei and Shang, Ning and Sun, Youran and Zhu, Yi and Yang, Fan and Yang, Mao},
  title     = {{rStar-Math}: Small {LLMs} Can Master Math Reasoning with Self-Evolved Deep Thinking},
  booktitle = {International Conference on Machine Learning, {ICML}},
  year      = {2025},
}

ICML

LongRoPE2: Near-Lossless LLM Context Window Scaling

Ning Shang, and 7 more authors

In International Conference on Machine Learning, ICML, 2025

Bib HTML

@inproceedings{shang2025longrope2,
  author    = {Shang, Ning and Zhang, Li Lyna and Wang, Siyuan and Zhang, Gaokai and Lopez, Gilsinia and Yang, Fan and Chen, Weizhu and Yang, Mao},
  title     = {{LongRoPE2}: Near-Lossless {LLM} Context Window Scaling},
  booktitle = {International Conference on Machine Learning, {ICML}},
  year      = {2025},
}

ICLR

Automated Proof Generation for Rust Code via Self-Evolution

Tianyu Chen, and 13 more authors

In International Conference on Learning Representations, ICLR, 2025

Bib HTML

@inproceedings{chen2025automatedproofgenerationrust,
  author    = {Chen, Tianyu and Lu, Shuai and Lu, Shan and Gong, Yeyun and Yang, Chenyuan and Li, Xuheng and Misu, Md Rakib Hossain and Yu, Hao and Duan, Nan and Cheng, Peng and Yang, Fan and Lahiri, Shuvendu K. and Xie, Tao and Zhou, Lidong},
  title     = {Automated Proof Generation for {Rust} Code via Self-Evolution},
  booktitle = {International Conference on Learning Representations, {ICLR}},
  year      = {2025},
}

ICLR

Mutual Reasoning Makes Smaller LLMs Stronger Problem-Solvers

Zhenting Qi, and 5 more authors

In International Conference on Learning Representations, ICLR, 2025

Hugging Face Daily Papers Bib HTML Code

Selected as Hugging Face daily papers: #2 paper of the day (2024-08-13).

@inproceedings{qi2025mutualreasoningmakessmaller,
  author    = {Qi, Zhenting and Ma, Mingyuan and Xu, Jiahang and Zhang, Li Lyna and Yang, Fan and Yang, Mao},
  title     = {Mutual Reasoning Makes Smaller {LLMs} Stronger Problem-Solvers},
  booktitle = {International Conference on Learning Representations, {ICLR}},
  year      = {2025},
}

ICLR

Proving Olympiad Inequalities by Synergizing LLMs and Symbolic Reasoning

Zenan Li, and 8 more authors

In International Conference on Learning Representations, ICLR, 2025

Bib HTML

@inproceedings{inequalityiclr25,
  author    = {Li, Zenan and Li, Zhaoyu and Tang, Wen and Zhang, Xian and Yao, Yuan and Si, Xujie and Yang, Fan and Yang, Kaiyu and Ma, Xiaoxing},
  title     = {Proving {Olympiad} Inequalities by Synergizing {LLMs} and Symbolic Reasoning},
  booktitle = {International Conference on Learning Representations, {ICLR}},
  year      = {2025},
}

HPCA

LUT-DLA: Lookup Table as Efficient Extreme Low-Bit Deep Learning Accelerator

Guoyu Li, and 8 more authors

In 31st International Symposium on High-Performance Computer Architecture, HPCA, 2025

Bib HTML

@inproceedings{lutdla25,
  author    = {Li, Guoyu and Ye, Shengyu and Chen, Chunyun and Wang, Yang and Yang, Fan and Cao, Ting and Liu, Cheng and Sabry, Mohamed M. and Yang, Mao},
  title     = {{LUT-DLA}: Lookup Table as Efficient Extreme Low-Bit Deep Learning Accelerator},
  booktitle = {31st International Symposium on High-Performance Computer Architecture, {HPCA}},
  year      = {2025},
}

OOPSLA

AutoVerus: Automated Proof Generation for Rust Code

Chenyuan Yang, and 12 more authors

Proc. ACM Program. Lang. Site, the ArXiv version, Oct 2025

Distinguished Artifact Award Bib HTML

The OOPSLA 2025 Artifact Evaluation Committee has selected this work to receive a Distinguished Artifact Award.

@article{yang2025autoverus,
  author    = {Yang, Chenyuan and Li, Xuheng and Misu, Md Rakib Hossain and Yao, Jianan and Cui, Weidong and Gong, Yeyun and Hawblitzel, Chris and Lahiri, Shuvendu and Lorch, Jacob R. and Lu, Shuai and Yang, Fan and Zhou, Ziqiao and Lu, Shan},
  title     = {{AutoVerus}: Automated Proof Generation for {Rust} Code},
  journal   = {Proc. ACM Program. Lang.},
  volume    = {9},
  articleno = {396},
  month     = oct,
  year      = {2025},
}

2024

ArXiv

LUT Tensor Core: Lookup Table Enables Efficient Low-Bit LLM Inference Acceleration

Zhiwen Mo, and 10 more authors

ArXiv. (A revised version appears in ISCA’25), Oct 2024

Bib HTML

@article{molutcore2024,
  author  = {Mo, Zhiwen and Wang, Lei and Wei, Jianyu and Zeng, Zhichen and Cao, Shijie and Ma, Lingxiao and Jing, Naifeng and Cao, Ting and Xue, Jilong and Yang, Fan and Yang, Mao},
  title   = {{LUT} Tensor Core: Lookup Table Enables Efficient Low-Bit {LLM} Inference Acceleration},
  journal = {ArXiv},
  year    = {2024},
}

NeurIPS

Autoformalize Mathematical Statements by Symbolic Equivalence and Semantic Consistency

Zenan Li, and 6 more authors

In Advances in Neural Information Processing Systems, NeurIPS, Oct 2024

Bib HTML

@inproceedings{NEURIPS2024_AutoFormalization,
  author    = {Li, Zenan and Wu, Yifan and Li, Zhaoyu and Wei, Xinming and Zhang, Xian and Yang, Fan and Ma, Xiaoxing},
  title     = {Autoformalize Mathematical Statements by Symbolic Equivalence and Semantic Consistency},
  booktitle = {Advances in Neural Information Processing Systems, {NeurIPS}},
  year      = {2024},
}

NeurIPS

Neuro-Symbolic Data Generation for Math Reasoning

Zenan Li, and 7 more authors

In Advances in Neural Information Processing Systems, NeurIPS, Oct 2024

Bib HTML

@inproceedings{NEURIPS2024_NSMR,
  author    = {Li, Zenan and Zhou, Zhi and Yao, Yuan and Zhang, Xian and Li, Yu-Feng and Cao, Chun and Yang, Fan and Ma, Xiaoxing},
  title     = {Neuro-Symbolic Data Generation for Math Reasoning},
  booktitle = {Advances in Neural Information Processing Systems, {NeurIPS}},
  year      = {2024},
}

EMNLP

Fewer is More: Boosting LLM Reasoning with Reinforced Context Pruning

Xijie Huang, and 4 more authors

In EMNLP (Main), Oct 2024

Bib HTML

@inproceedings{huang2024fewermoreboostingllm,
  author    = {Huang, Xijie and Zhang, Li Lyna and Cheng, Kwang-Ting and Yang, Fan and Yang, Mao},
  title     = {Fewer is More: Boosting {LLM} Reasoning with Reinforced Context Pruning},
  booktitle = {{EMNLP (Main)}},
  year      = {2024},
}

SOSP
Uncovering Nested Data Parallelism and Data Reuse in DNN Computation with FractalTensor

Siran Liu, and 7 more authors

In SOSP, Oct 2024

Abs Bib HTML Slides

To speed up computation, deep neural networks (DNNs) usually rely on highly optimized tensor operators. Despite the effectiveness, tensor operators are often defined empirically with ad hoc semantics. This hinders the analysis and optimization across operator boundaries. FractalTensor is a programming framework that addresses this challenge. At the core, FractalTensor is a nested list-based abstract data type (ADT), where each element is a tensor with static shape or another FractalTensor (i.e., nested). DNNs are then defined by high-order compute operators like map/reduce/scan and data access operators like window/stride on FractalTensor. This new way of DNN definition explicitly exposes nested data parallelism and fine-grained data access patterns, opening new opportunities for whole program analysis and optimization. To exploit these opportunities, from the FractalTensor-based code the compiler extracts a nested multi-dimensional dataflow graph called Extended Task Dependence Graph (ETDG), which provides a holistic view of data dependency across different granularity. The ETDG is then transformed into an efficient implementation through graph coarsening, data reordering, and access materialization. Evaluation on six representative DNNs like RNN and FlashAttention on NVIDIA A100 shows that FractalTensor achieves speedup by up to 5.44x and 1.97x on average through a unified solution for diverse optimizations.
@inproceedings{FractalTensorSosp24,
  author    = {Liu, Siran and Qi, Chengxiang and Cao, Ying and Yang, Chao and Hu, Weifang and Shi, Xuanhua and Yang, Fan and Yang, Mao},
  title     = {Uncovering Nested Data Parallelism and Data Reuse in {DNN} Computation with {FractalTensor}},
  booktitle = {{SOSP}},
  year      = {2024},
}

ECCV

IRGen: Generative Modeling for Image Retrieval

Yidan Zhang, and 13 more authors

In ECCV, Oct 2024

Bib HTML

@inproceedings{zhang2024irgengenerativemodelingimage,
  author    = {Zhang, Yidan and Zhang, Ting and Chen, Dong and Wang, Yujing and Chen, Qi and Xie, Xing and Sun, Hao and Deng, Weiwei and Zhang, Qi and Yang, Fan and Yang, Mao and Liao, Qingmin and Wang, Jingdong and Guo, Baining},
  title     = {{IRGen}: Generative Modeling for Image Retrieval},
  booktitle = {{ECCV}},
  year      = {2024},
}

WWW
OneSparse: A Unified System for Multi-index Vector Search

Yaoqi Chen, and 16 more authors

In Companion Proceedings of the ACM Web Conference 2024, Singapore, Singapore, Oct 2024

Abs Bib HTML

Multi-index vector search has become the cornerstone for many applications, such as recommendation systems. Efficient search in such a multi-modal hybrid vector space is challenging since no single index design performs well for all kinds of vector data. Existing approaches to processing multi-index hybrid queries either suffer from algorithmic limitations or processing inefficiency. In this paper, we propose OneSparse, a unified multi-vector index query system that incorporates multiple posting-based vector indices, which enables highly efficient retrieval of multi-modal data-sets. OneSparse introduces a novel multi-index query engine design of inter-index intersection push-down. It also optimizes the vector posting format to expedite multi-index queries. Our experiments show OneSparse achieves more than 6x search performance improvement while maintaining comparable accuracy. OneSparse has already been integrated into Microsoft online web search and advertising systems with 5x+ latency gain for Bing web search and 2.0% Revenue Per Mille (RPM) gain for Bing sponsored search.
@inproceedings{10.1145/3589335.3648338,
  author    = {Chen, Yaoqi and Zheng, Ruicheng and Chen, Qi and Xu, Shuotao and Zhang, Qianxi and Wu, Xue and Han, Weihao and Yuan, Hua and Li, Mingqin and Wang, Yujing and Li, Jason and Yang, Fan and Sun, Hao and Deng, Weiwei and Sun, Feng and Zhang, Qi and Yang, Mao},
  title     = {{OneSparse}: A Unified System for Multi-index Vector Search},
  booktitle = {Companion Proceedings of the {ACM} Web Conference 2024},
  year      = {2024},
  location  = {Singapore, Singapore},
  publisher = {Association for Computing Machinery},
  address   = {New York, NY, USA},
  series    = {WWW '24},
  pages     = {393--402},
  doi       = {10.1145/3589335.3648338},
  isbn      = {9798400701726},
  numpages  = {10},
  keywords  = {approximate nearest neighbor search, multi-index search, retrieval system, sparse and dense search},
}
WWW
MS MARCO Web Search: A Large-scale Information-rich Web Dataset with Millions of Real Click Labels

Qi Chen, and 30 more authors

In Companion Proceedings of the ACM Web Conference 2024, Singapore, Singapore, Oct 2024

Abs Bib HTML

Recent breakthroughs in large models have highlighted the critical significance of data scale, labels and modals. In this paper, we introduce MS MARCO Web Search, the first large-scale information-rich web dataset, featuring millions of real clicked query-document labels. This dataset closely mimics real-world web document and query distribution, provides rich information for various kinds of downstream tasks and encourages research in various areas, such as generic end-to-end neural indexer models, generic embedding models, and next generation information access system with large language models. MS MARCO Web Search offers a retrieval benchmark with three web retrieval challenge tasks that demands innovations in both machine learning and information retrieval system research domains. As the first dataset that meets large, real and rich data requirements, MS MARCO Web Search paves the way for future advancements in AI and system research. MS MARCO Web Search dataset is available at: https://github.com/microsoft/MS-MARCO-Web-Search.
@inproceedings{10.1145/3589335.3648327,
  author    = {Chen, Qi and Geng, Xiubo and Rosset, Corby and Buractaon, Carolyn and Lu, Jingwen and Shen, Tao and Zhou, Kun and Xiong, Chenyan and Gong, Yeyun and Bennett, Paul and Craswell, Nick and Xie, Xing and Yang, Fan and Tower, Bryan and Rao, Nikhil and Dong, Anlei and Jiang, Wenqi and Liu, Zheng and Li, Mingqin and Liu, Chuanjie and Li, Zengzhong and Majumder, Rangan and Neville, Jennifer and Oakley, Andy and Risvik, Knut Magne and Simhadri, Harsha Vardhan and Varma, Manik and Wang, Yujing and Yang, Linjun and Yang, Mao and Zhang, Ce},
  title     = {{MS MARCO} Web Search: A Large-scale Information-rich Web Dataset with Millions of Real Click Labels},
  booktitle = {Companion Proceedings of the {ACM} Web Conference 2024},
  year      = {2024},
  location  = {Singapore, Singapore},
  publisher = {Association for Computing Machinery},
  address   = {New York, NY, USA},
  series    = {WWW '24},
  pages     = {292--301},
  doi       = {10.1145/3589335.3648327},
  isbn      = {9798400701726},
  numpages  = {10},
  keywords  = {dataset, information retrieval, web search},
}
KDD
Understanding the Weakness of Large Language Model Agents within a Complex Android Environment

Mingzhe Xing, and 5 more authors

In Proceedings of the 30th ACM SIGKDD Conference on Knowledge Discovery and Data Mining, Barcelona, Spain, Oct 2024

Abs Bib HTML

Large language models (LLMs) have empowered intelligent agents to execute intricate tasks within domain-specific software such as browsers and games. However, when applied to general-purpose software systems like operating systems, LLM agents face three primary challenges. Firstly, the action space is vast and dynamic, posing difficulties for LLM agents to maintain an up-to-date understanding and deliver accurate responses. Secondly, real-world tasks often require inter-application cooperation, demanding farsighted planning from LLM agents. Thirdly, agents need to identify optimal solutions aligning with user constraints, such as security concerns and preferences. These challenges motivate AndroidArena, an environment and benchmark designed to evaluate LLM agents on a modern operating system. To address high-cost of manpower, we design a scalable and semi-automated method to construct the benchmark. In the task evaluation, AndroidArena incorporates accurate and adaptive metrics to address the issue of non-unique solutions. Our findings reveal that even state-of-the-art LLM agents struggle in cross-APP scenarios and adhering to specific constraints. Additionally, we identify a lack of four key capabilities, i.e. understanding, reasoning, exploration, and reflection, as primary reasons for the failure of LLM agents. Furthermore, we provide empirical analysis on the failure of reflection, and improve the success rate by 27% with our proposed exploration strategy. This work is the first to present valuable insights in understanding fine-grained weakness of LLM agents, and offers a path forward for future research in this area. Environment, benchmark, prompt, and evaluation code for AndroidArena are released at https://github.com/AndroidArenaAgent/AndroidArena.
@inproceedings{10.1145/3637528.3671650,
  author    = {Xing, Mingzhe and Zhang, Rongkai and Xue, Hui and Chen, Qi and Yang, Fan and Xiao, Zhen},
  title     = {Understanding the Weakness of Large Language Model Agents within a Complex {Android} Environment},
  booktitle = {Proceedings of the 30th {ACM} {SIGKDD} Conference on Knowledge Discovery and Data Mining},
  year      = {2024},
  location  = {Barcelona, Spain},
  publisher = {Association for Computing Machinery},
  address   = {New York, NY, USA},
  series    = {KDD '24},
  pages     = {6061--6072},
  doi       = {10.1145/3637528.3671650},
  isbn      = {9798400704901},
  numpages  = {12},
  keywords  = {ai agent, large language model, task planning},
}

ICML

LongRoPE: Extending LLM Context Window Beyond 2 Million Tokens

Yiran Ding, and 7 more authors

In International Conference on Machine Learning, ICML, Oct 2024

Hugging Face Daily Papers Bib HTML Code Poster

Selected as Hugging Face daily papers: #1 paper of the day (2024-02-22).

@inproceedings{ding2024longrope,
  author    = {Ding, Yiran and Zhang, Li Lyna and Zhang, Chengruidong and Xu, Yuanyuan and Shang, Ning and Xu, Jiahang and Yang, Fan and Yang, Mao},
  title     = {{LongRoPE}: Extending {LLM} Context Window Beyond 2 Million Tokens},
  booktitle = {International Conference on Machine Learning, {ICML}},
  year      = {2024},
}

ArXiv

Phi-3 Technical Report: A Highly Capable Language Model Locally on Your Phone

Marah Abdin, and 83 more authors

ArXiv. (Applying LongRoPE to Phi-3), Oct 2024

Bib HTML Code

@article{Abdin2024Phi3TR,
  author  = {Abdin, Marah and Jacobs, Sam Ade and Awan, Ammar Ahmad and Aneja, Jyoti and Awadallah, Ahmed and Awadalla, Hany Hassan and Bach, Nguyen and Bahree, Amit and Bakhtiari, Arash and Behl, Harkirat Singh and Benhaim, Alon and Bilenko, Misha and Bjorck, Johan and Bubeck, S{\'e}bastien and Cai, Martin and Mendes, Caio C{\'e}sar Teodoro and Chen, Weizhu and Chaudhary, Vishrav and Chopra, Parul and Del Giorno, Allison and de Rosa, Gustavo and Dixon, Matthew and Eldan, Ronen and Iter, Dan and Goswami, Abhishek and Gunasekar, Suriya and Haider, Emman and Hao, Junheng and Hewett, Russell J. and Huynh, Jamie and Javaheripi, Mojan and Jin, Xin and Kauffmann, Piero and Karampatziakis, Nikos and Kim, Dongwoo and Khademi, Mahoud and Kurilenko, Lev and Lee, James R. and Lee, Yin Tat and Li, Yuanzhi and Liang, Chen and Liu, Weishung and Lin, Eric and Lin, Zeqi and Madan, Piyush and Mitra, Arindam and Modi, Hardik and Nguyen, Anh and Norick, Brandon and Patra, Barun and Perez-Becker, Daniel and Portet, Thomas and Pryzant, Reid and Qin, Heyang and Radmilac, Marko and Rosset, Corby and Roy, Sambudha and Saarikivi, Olli and Saied, Amin and Salim, Adil and Santacroce, Michael and Shah, Shital and Shang, Ning and Sharma, Hiteshi and Song, Xianmin and Ruwase, Olatunji and Wang, Xin and Ward, Rachel and Wang, Guanhua and Witte, Philipp and Wyatt, Michael and Xu, Can and Xu, Jiahang and Yadav, Sonali and Yang, Fan and Yang, Ziyi and Yu, Donghan and Zhang, Cheng-Yuan and Zhang, Cyril and Zhang, Jianwen and Zhang, Li Lyna and Zhang, Yi and Zhang, Yunan and Zhou, Xiren},
  title   = {{Phi-3} Technical Report: A Highly Capable Language Model Locally on Your Phone},
  journal = {ArXiv},
  year    = {2024},
}

OSDI

nnScaler: Constraint-Guided Parallelization Plan Generation for Deep Learning Training

Zhiqi Lin, and 13 more authors

In 18th USENIX Symposium on Operating Systems Design and Implementation, OSDI, Oct 2024

Bib HTML Code Slides

@inproceedings{nnscaler24,
  author    = {Lin, Zhiqi and Miao, Youshan and Zhang, Quanlu and Yang, Fan and Zhu, Yi and Li, Cheng and Maleki, Saeed and Cao, Xu and Shang, Ning and Yang, Yilei and Xu, Weijiang and Yang, Mao and Zhang, Lintao and Zhou, Lidong},
  title     = {{nnScaler}: Constraint-Guided Parallelization Plan Generation for Deep Learning Training},
  booktitle = {18th {USENIX} Symposium on Operating Systems Design and Implementation, {OSDI}},
  year      = {2024},
}

OSDI

Parrot: Efficient Serving of LLM-based Applications with Semantic Variable

Chaofan Lin, and 6 more authors

In 18th USENIX Symposium on Operating Systems Design and Implementation, OSDI, Oct 2024

Hugging Face Daily Papers Bib HTML Code Slides

Selected as Hugging Face daily papers (2024-05-31).

@inproceedings{parrot24,
  author    = {Lin, Chaofan and Han, Zhenhua and Zhang, Chengruidong and Yang, Yuqing and Yang, Fan and Chen, Chen and Qiu, Lili},
  title     = {Parrot: Efficient Serving of {LLM}-based Applications with Semantic Variable},
  booktitle = {18th {USENIX} Symposium on Operating Systems Design and Implementation, {OSDI}},
  year      = {2024},
}

OSDI

Ladder: Enabling Efficient Low-Precision Deep Learning Computing through Hardware-aware Tensor Transformation

Lei Wang, and 11 more authors

In 18th USENIX Symposium on Operating Systems Design and Implementation, OSDI, Oct 2024

Bib HTML Code

@inproceedings{ladder24,
  author    = {Wang, Lei and Ma, Lingxiao and Cao, Shijie and Zhang, Quanlu and Xue, Jilong and Shi, Yining and Zheng, Ningxin and Miao, Ziming and Yang, Fan and Cao, Ting and Yang, Yuqing and Yang, Mao},
  title     = {Ladder: Enabling Efficient Low-Precision Deep Learning Computing through Hardware-aware Tensor Transformation},
  booktitle = {18th {USENIX} Symposium on Operating Systems Design and Implementation, {OSDI}},
  year      = {2024},
}

ASPLOS

Amanda: Unified Instrumentation Framework for Deep Neural Networks

Yue Guan, and 12 more authors

In Proceedings of the 29th ACM International Conference on Architectural Support for Programming Languages and Operating Systems, ASPLOS’24, Oct 2024

Bib HTML

@inproceedings{10.1145/3617232.3624864,
  author    = {Guan, Yue and Qiu, Yuxian and Leng, Jingwen and Yang, Fan and Yu, Shuo and Liu, Yunxin and Feng, Yu and Zhu, Yuhao and Zhou, Lidong and Liang, Yun and Zhang, Chen and Li, Chao and Guo, Minyi},
  title     = {Amanda: Unified Instrumentation Framework for Deep Neural Networks},
  booktitle = {Proceedings of the 29th {ACM} International Conference on Architectural Support for Programming Languages and Operating Systems, {ASPLOS}'24},
  year      = {2024},
  doi       = {10.1145/3617232.3624864},
}

EuroSys

Aceso: Efficient Parallel DNN Training through Iterative Bottleneck Alleviation

Guodong Liu, and 7 more authors

In Proceedings of the Nineteenth European Conference on Computer Systems, EuroSys, Oct 2024

Bib HTML

@inproceedings{conf/eurosys/LMLSMYBW00Y24,
  author    = {Liu, Guodong and Miao, Youshan and Lin, Zhiqi and Shi, Xiaoxiang and Maleki, Saeed and Yang, Fan and Bao, Yungang and Wang, Sa},
  title     = {Aceso: Efficient Parallel {DNN} Training through Iterative Bottleneck Alleviation},
  booktitle = {Proceedings of the Nineteenth European Conference on Computer Systems, {EuroSys}},
  year      = {2024},
}

HPCA

Tessel: Boosting Distributed DNN Execution with Flexible Schedule Search

Zhiqi Lin, and 6 more authors

In 30th International Symposium on High-Performance Computer Architecture, HPCA, Oct 2024

Bib HTML Code

@inproceedings{conf/hpca/LMXLOMF00Y24,
  author    = {Lin, Zhiqi and Miao, Youshan and Xu, Guanbin and Li, Cheng and Saarikivi, Olli and Maleki, Saeed and Yang, Fan},
  title     = {Tessel: Boosting Distributed {DNN} Execution with Flexible Schedule Search},
  booktitle = {30th International Symposium on High-Performance Computer Architecture, {HPCA}},
  year      = {2024},
}

ICME

Integer or Floating Point? New Outlooks for Low-Bit Quantization on Large Language Models

Yijia Zhang, and 8 more authors

In IEEE International Conference on Multimedia and Expo, ICME, Oct 2024

Bib HTML

@inproceedings{zhang2023integerfloatingpointnew,
  author    = {Zhang, Yijia and Zhao, Lingran and Cao, Shijie and Wang, Wenqiang and Cao, Ting and Yang, Fan and Yang, Mao and Zhang, Shanghang and Xu, Ningyi},
  title     = {Integer or Floating Point? New Outlooks for Low-Bit Quantization on Large Language Models},
  booktitle = {IEEE International Conference on Multimedia and Expo, {ICME}},
  year      = {2024},
}

2023

ArXiv

Adam Accumulation to Reduce Memory Footprints of both Activations and Gradients for Large-scale DNN Training

Yijia Zhang, and 7 more authors

Oct 2023

Bib HTML

@misc{zhang2023adamaccumulationreducememory,
  title = {Adam Accumulation to Reduce Memory Footprints of both Activations and Gradients for Large-scale {DNN} Training},
  author = {Zhang, Yijia and Han, Yibo and Cao, Shijie and Dai, Guohao and Miao, Youshan and Cao, Ting and Yang, Fan and Xu, Ningyi},
  year = {2023},
  eprint = {2305.19982},
  archiveprefix = {arXiv},
  primaryclass = {cs.LG},
}

ArXiv

SuperScaler: Supporting Flexible DNN Parallelization via a Unified Abstraction

Zhiqi Lin, and 12 more authors

Oct 2023

Bib HTML

@misc{lin2023superscalersupportingflexiblednn,
  title = {{SuperScaler}: Supporting Flexible {DNN} Parallelization via a Unified Abstraction},
  author = {Lin, Zhiqi and Miao, Youshan and Liu, Guodong and Shi, Xiaoxiang and Zhang, Quanlu and Yang, Fan and Maleki, Saeed and Zhu, Yi and Cao, Xu and Li, Cheng and Yang, Mao and Zhang, Lintao and Zhou, Lidong},
  year = {2023},
  eprint = {2301.08984},
  archiveprefix = {arXiv},
  primaryclass = {cs.DC},
}

OSDI

VBASE: Unifying Online Vector Similarity Search and Relational Queries via Relaxed Monotonicity

Qianxi Zhang, and 11 more authors

In 17th USENIX Symposium on Operating Systems Design and Implementation, OSDI, Oct 2023

Bib HTML Code

@inproceedings{DBLP:conf/osdi/ZhangXCSXCCH00Y23,
  author    = {Zhang, Qianxi and Xu, Shuotao and Chen, Qi and Sui, Guoxin and Xie, Jiadong and Cai, Zhizhen and Chen, Yaoqi and He, Yinxuan and Yang, Yuqing and Yang, Fan and Yang, Mao and Zhou, Lidong},
  title     = {{VBASE:} Unifying Online Vector Similarity Search and Relational Queries via Relaxed Monotonicity},
  booktitle = {17th {USENIX} Symposium on Operating Systems Design and Implementation, {OSDI}},
  year      = {2023},
}

OSDI

Cocktailer: Analyzing and Optimizing Dynamic Control Flow in Deep Learning

Chen Zhang, and 8 more authors

In 17th USENIX Symposium on Operating Systems Design and Implementation, OSDI, Oct 2023

Bib HTML

@inproceedings{DBLP:conf/osdi/ZhangMXSM0Z0Y23,
  author    = {Zhang, Chen and Ma, Lingxiao and Xue, Jilong and Shi, Yining and Miao, Ziming and Yang, Fan and Zhai, Jidong and Yang, Zhi and Yang, Mao},
  title     = {Cocktailer: Analyzing and Optimizing Dynamic Control Flow in Deep Learning},
  booktitle = {17th {USENIX} Symposium on Operating Systems Design and Implementation, {OSDI}},
  year      = {2023},
}

OSDI

Welder: Scheduling Deep Learning Memory Access via Tile-graph

Yining Shi, and 8 more authors

In 17th USENIX Symposium on Operating Systems Design and Implementation, OSDI, Oct 2023

Bib HTML

@inproceedings{DBLP:conf/osdi/00010XMXMG0Z23,
  author    = {Shi, Yining and Yang, Zhi and Xue, Jilong and Ma, Lingxiao and Xia, Yuqing and Miao, Ziming and Guo, Yuxiao and Yang, Fan and Zhou, Lidong},
  title     = {Welder: Scheduling Deep Learning Memory Access via Tile-graph},
  booktitle = {17th {USENIX} Symposium on Operating Systems Design and Implementation, {OSDI}},
  year      = {2023},
}

OSDI

Optimizing Dynamic Neural Networks with Brainstorm

Weihao Cui, and 13 more authors

In 17th USENIX Symposium on Operating Systems Design and Implementation, OSDI, Oct 2023

Bib HTML

@inproceedings{DBLP:conf/osdi/CuiHOWZM00XQZ0T23,
  title = {Optimizing Dynamic Neural Networks with {Brainstorm}},
  author = {Cui, Weihao and Han, Zhenhua and Ouyang, Lingji and Wang, Yichuan and Zheng, Ningxin and Ma, Lingxiao and Yang, Yuqing and Yang, Fan and Xue, Jilong and Qiu, Lili and Zhou, Lidong and Chen, Quan and Tan, Haisheng and Guo, Minyi},
  year = {2023},
  booktitle = {17th {USENIX} Symposium on Operating Systems Design and Implementation, {OSDI}},
}

ArXiv

BitNet: Scaling 1-bit Transformers for Large Language Models

Hongyu Wang, and 9 more authors

Oct 2023

Bib HTML

@misc{bitnet23,
  title = {{BitNet}: Scaling 1-bit Transformers for Large Language Models},
  author = {Wang, Hongyu and Ma, Shuming and Dong, Li and Huang, Shaohan and Wang, Huaijie and Ma, Lingxiao and Yang, Fan and Wang, Ruiping and Wu, Yi and Wei, Furu},
  year = {2023},
  eprint = {2310.11453},
  archiveprefix = {arXiv},
  primaryclass = {cs.CL},
}

SOSP

PIT: Optimization of Dynamic Sparse Deep Learning Models via Permutation Invariant Transformation

Ningxin Zheng, and 10 more authors

In Proceedings of the 29th Symposium on Operating Systems Principles, SOSP, Oct 2023

Bib HTML Slides

@inproceedings{DBLP:conf/sosp/ZhengJZHM0YZQYZ23,
  author    = {Zheng, Ningxin and Jiang, Huiqiang and Zhang, Quanlu and Han, Zhenhua and Ma, Lingxiao and Yang, Yuqing and Yang, Fan and Zhang, Chengruidong and Qiu, Lili and Yang, Mao and Zhou, Lidong},
  title     = {{PIT:} Optimization of Dynamic Sparse Deep Learning Models via Permutation Invariant Transformation},
  booktitle = {Proceedings of the 29th Symposium on Operating Systems Principles, {SOSP}},
  year      = {2023},
}

SOSP

SPFresh: Incremental In-Place Update for Billion-Scale Vector Search

Yuming Xu, and 11 more authors

In Proceedings of the 29th Symposium on Operating Systems Principles, SOSP, Oct 2023

Bib HTML Slides

@inproceedings{DBLP:conf/sosp/XuLLXCZLYYYCY23,
  title = {{SPFresh}: Incremental In-Place Update for Billion-Scale Vector Search},
  author = {Xu, Yuming and Liang, Hengyu and Li, Jin and Xu, Shuotao and Chen, Qi and Zhang, Qianxi and Li, Cheng and Yang, Ziyue and Yang, Fan and Yang, Yuqing and Cheng, Peng and Yang, Mao},
  year = {2023},
  booktitle = {Proceedings of the 29th Symposium on Operating Systems Principles, {SOSP}},
}

EuroSys

SiloD: A Co-design of Caching and Scheduling for Deep Learning Clusters

Hanyu Zhao, and 11 more authors

In Proceedings of the Eighteenth European Conference on Computer Systems, EuroSys, Oct 2023

Bib HTML

@inproceedings{DBLP:conf/eurosys/ZhaoHYZ0YZL0QZZ23,
  title = {{SiloD}: {A} Co-design of Caching and Scheduling for Deep Learning Clusters},
  author = {Zhao, Hanyu and Han, Zhenhua and Yang, Zhi and Zhang, Quanlu and Li, Mingxia and Yang, Fan and Zhang, Qianxi and Li, Binyang and Yang, Yuqing and Qiu, Lili and Zhang, Lintao and Zhou, Lidong},
  year = {2023},
  booktitle = {Proceedings of the Eighteenth European Conference on Computer Systems, {EuroSys}},
}

ASPLOS

ElasticFlow: An Elastic Serverless Training Platform for Distributed Deep Learning

Diandian Gu, and 9 more authors

In Proceedings of the 28th ACM International Conference on Architectural Support for Programming Languages and Operating Systems, Volume 2, ASPLOS, Oct 2023

Bib HTML

@inproceedings{DBLP:conf/asplos/GuZZXHCYHJL23,
  title = {{ElasticFlow}: An Elastic Serverless Training Platform for Distributed Deep Learning},
  author = {Gu, Diandian and Zhao, Yihao and Zhong, Yinmin and Xiong, Yifan and Han, Zhenhua and Cheng, Peng and Yang, Fan and Huang, Gang and Jin, Xin and Liu, Xuanzhe},
  year = {2023},
  booktitle = {Proceedings of the 28th {ACM} International Conference on Architectural Support for Programming Languages and Operating Systems, Volume 2, {ASPLOS}},
}

ISCA

OliVe: Accelerating Large Language Models via Hardware-friendly Outlier-Victim Pair Quantization

Cong Guo, and 8 more authors

In Proceedings of the 50th Annual International Symposium on Computer Architecture, ISCA, Oct 2023

Bib HTML

@inproceedings{DBLP:conf/isca/0003THL00LG023,
  title = {{OliVe}: Accelerating Large Language Models via Hardware-friendly Outlier-Victim Pair Quantization},
  author = {Guo, Cong and Tang, Jiaming and Hu, Weiming and Leng, Jingwen and Zhang, Chen and Yang, Fan and Liu, Yunxin and Guo, Minyi and Zhu, Yuhao},
  year = {2023},
  booktitle = {Proceedings of the 50th Annual International Symposium on Computer Architecture, {ISCA}},
}

NSDI

On Modular Learning of Distributed Systems for Predicting End-to-End Latency

Chieh-Jan Mike Liang, and 7 more authors

In 20th USENIX Symposium on Networked Systems Design and Implementation, NSDI, Oct 2023

Bib HTML

@inproceedings{DBLP:conf/nsdi/LiangFXYLZYZ23,
  author    = {Liang, Chieh{-}Jan Mike and Fang, Zilin and Xie, Yuqing and Yang, Fan and Li, Zhao Lucis and Zhang, Li Lyna and Yang, Mao and Zhou, Lidong},
  title     = {On Modular Learning of Distributed Systems for Predicting End-to-End Latency},
  booktitle = {20th {USENIX} Symposium on Networked Systems Design and Implementation, {NSDI}},
  year      = {2023},
}

NeurIPS

Model-enhanced Vector Index

Hailin Zhang, and 18 more authors

In Advances in Neural Information Processing Systems, NeurIPS, Oct 2023

Bib HTML

@inproceedings{NEURIPS2023_ac112e8f,
  title = {Model-enhanced Vector Index},
  author = {Zhang, Hailin and Wang, Yujing and Chen, Qi and Chang, Ruiheng and Zhang, Ting and Miao, Ziming and Hou, Yingyan and Ding, Yang and Miao, Xupeng and Wang, Haonan and Pang, Bochen and Zhan, Yuefeng and Sun, Hao and Deng, Weiwei and Zhang, Qi and Yang, Fan and Xie, Xing and Yang, Mao and Cui, Bin},
  year = {2023},
  booktitle = {Advances in Neural Information Processing Systems, {NeurIPS}},
}

MLSys

Tutel: Adaptive Mixture-of-Experts at Scale

Changho Hwang, and 14 more authors

In Proceedings of Machine Learning and Systems, MLSys, Oct 2023

Bib HTML

@inproceedings{MLSYS2023_5616d34c,
  title = {Tutel: Adaptive Mixture-of-Experts at Scale},
  author = {Hwang, Changho and Cui, Wei and Xiong, Yifan and Yang, Ziyue and Liu, Ze and Hu, Han and Wang, Zilong and Salas, Rafael and Jose, Jithin and Ram, Prabhat and Chau, HoYuen and Cheng, Peng and Yang, Fan and Yang, Mao and Xiong, Yongqiang},
  year = {2023},
  booktitle = {Proceedings of Machine Learning and Systems, {MLSys}},
  publisher = {Curran Associates, Inc.},
  volume = {5},
  pages = {269--287},
  editor = {Song, D. and Carbin, M. and Chen, T.},
}

MLSys

Efficient GPU Kernels for N:M-Sparse Weights in Deep Learning

Bin Lin, and 10 more authors

In Proceedings of Machine Learning and Systems, Oct 2023

Bib HTML

@inproceedings{MLSYS2023_a10deb4d,
  title = {Efficient {GPU} Kernels for {N:M-Sparse} Weights in Deep Learning},
  author = {Lin, Bin and Zheng, Ningxin and Wang, Lei and Cao, Shijie and Ma, Lingxiao and Zhang, Quanlu and Zhu, Yi and Cao, Ting and Xue, Jilong and Yang, Yuqing and Yang, Fan},
  year = {2023},
  booktitle = {Proceedings of Machine Learning and Systems},
  publisher = {Curran Associates, Inc.},
  volume = {5},
  pages = {513--525},
  editor = {Song, D. and Carbin, M. and Chen, T.},
}

ACL
NUWA-XL: Diffusion over Diffusion for eXtremely Long Video Generation

Shengming Yin, and 15 more authors

In ACL 2023, Jul 2023

Abs Bib HTML

In this paper, we propose NUWA-XL, a novel Diffusion over Diffusion architecture for eXtremely Long video generation. Most current work generates long videos segment by segment sequentially, which normally leads to the gap between training on short videos and inferring long videos, and the sequential generation is inefficient. Instead, our approach adopts a “coarse-to-fine” process, in which the video can be generated in parallel at the same granularity. A global diffusion model is applied to generate the keyframes across the entire time range, and then local diffusion models recursively fill in the content between nearby frames. This simple yet effective strategy allows us to directly train on long videos (3376 frames) to reduce the training-inference gap, and makes it possible to generate all segments in parallel. To evaluate our model, we build FlintstonesHD dataset, a new benchmark for long video generation. Experiments show that our model not only generates high-quality long videos with both global and local coherence, but also decreases the average inference time from 7.55min to 26s (by 94.26%) at the same hardware setting when generating 1024 frames. The homepage link is https://msra-nuwa.azurewebsites.net/#/.
@inproceedings{yin2023nuwa-xl,
  title = {{NUWA-XL}: Diffusion over Diffusion for {eXtremely} Long Video Generation},
  author = {Yin, Shengming and Wu, Chenfei and Yang, Huan and Wang, Jianfeng and Wang, Xiaodong and Ni, Minheng and Yang, Zhengyuan and Li, Linjie and Liu, Shuguang and Yang, Fan and Fu, Jianlong and Gong, Ming and Wang, Lijuan and Liu, Zicheng and Li, Houqiang and Duan, Nan},
  year = {2023},
  month = jul,
  booktitle = {{ACL} 2023},
}
IJCAI
Learning 3D photography videos via self-supervised diffusion on single images

Xiaodong Wang, and 11 more authors

In Proceedings of the Thirty-Second International Joint Conference on Artificial Intelligence, Macao, P.R.China, Jul 2023

Abs Bib

3D photography renders a static image into a video with appealing 3D visual effects. Existing approaches typically first conduct monocular depth estimation, then render the input frame to subsequent frames with various viewpoints, and finally use an inpainting model to fill those missing/occluded regions. The inpainting model plays a crucial role in rendering quality, but it is normally trained on out-of-domain data. To reduce the training and inference gap, we propose a novel self-supervised diffusion model as the inpainting module. Given a single input image, we automatically construct a training pair of the masked occluded image and the ground-truth image with random cycle rendering. The constructed training samples are closely aligned to the testing instances, without the need for data annotation. To make full use of the masked images, we designed a Masked Enhanced Block (MEB), which can be easily plugged into the UNet and enhance the semantic conditions. Towards real-world animation, we present a novel task: out-animation, which extends the space and time of input objects. Extensive experiments on real datasets show that our method achieves competitive results with existing SOTA methods.
@inproceedings{10.24963/ijcai.2023/167,
  title = {Learning {3D} photography videos via self-supervised diffusion on single images},
  author = {Wang, Xiaodong and Wu, Chenfei and Yin, Shengming and Ni, Minheng and Wang, Jianfeng and Li, Linjie and Yang, Zhengyuan and Yang, Fan and Wang, Lijuan and Liu, Zicheng and Fang, Yuejian and Duan, Nan},
  year = {2023},
  booktitle = {Proceedings of the Thirty-Second International Joint Conference on Artificial Intelligence},
  location = {Macao, P.R.China},
  series = {IJCAI '23},
  doi = {10.24963/ijcai.2023/167},
  isbn = {978-1-956792-03-4},
  url = {https://doi.org/10.24963/ijcai.2023/167},
  articleno = {167},
  numpages = {9},
}

2022

ICLR

SQuant: On-the-Fly Data-Free Quantization via Diagonal Hessian Approximation

Cong Guo, and 8 more authors

In The Tenth International Conference on Learning Representations, ICLR, Jul 2022

Bib HTML

@inproceedings{DBLP:conf/iclr/0003QLGZLY0G22,
  title = {{SQuant}: On-the-Fly Data-Free Quantization via Diagonal {Hessian} Approximation},
  author = {Guo, Cong and Qiu, Yuxian and Leng, Jingwen and Gao, Xiaotian and Zhang, Chen and Liu, Yunxin and Yang, Fan and Zhu, Yuhao and Guo, Minyi},
  year = {2022},
  booktitle = {The Tenth International Conference on Learning Representations, {ICLR}},
}

ICCD

Nesting Forward Automatic Differentiation for Memory-Efficient Deep Neural Network Training

Cong Guo, and 8 more authors

In 2022 IEEE 40th International Conference on Computer Design (ICCD), Jul 2022

Bib HTML

@inproceedings{9978319,
  author    = {Guo, Cong and Qiu, Yuxian and Leng, Jingwen and Zhang, Chen and Cao, Ying and Zhang, Quanlu and Liu, Yunxin and Yang, Fan and Guo, Minyi},
  title     = {Nesting Forward Automatic Differentiation for Memory-Efficient Deep Neural Network Training},
  booktitle = {2022 IEEE 40th International Conference on Computer Design (ICCD)},
  pages     = {738--745},
  year      = {2022},
}

MICRO

ANT: Exploiting Adaptive Numerical Data Type for Low-bit Deep Neural Network Quantization

Cong Guo, and 7 more authors

In 55th IEEE/ACM International Symposium on Microarchitecture, MICRO, Jul 2022

IEEE Micro Top Picks 2023 Honorable Mention Bib HTML

Highlighted as an IEEE Micro Top Picks Honorable Mention in the July/August special edition of IEEE Micro 2023

@inproceedings{DBLP:conf/micro/00030LL0LG022,
  author    = {Guo, Cong and Zhang, Chen and Leng, Jingwen and Liu, Zihan and Yang, Fan and Liu, Yunxin and Guo, Minyi and Zhu, Yuhao},
  title     = {{ANT:} Exploiting Adaptive Numerical Data Type for Low-bit Deep Neural Network Quantization},
  booktitle = {55th {IEEE/ACM} International Symposium on Microarchitecture, {MICRO}},
  year      = {2022},
}

OSDI

ROLLER: Fast and Efficient Tensor Compilation for Deep Learning

Hongyu Zhu, and 14 more authors

In 16th USENIX Symposium on Operating Systems Design and Implementation, OSDI, Jul 2022

Bib HTML

@inproceedings{DBLP:conf/osdi/ZhuWDKLZXMXC0YZ22,
  author    = {Zhu, Hongyu and Wu, Ruofan and Diao, Yijia and Ke, Shanbin and Li, Haoyu and Zhang, Chen and Xue, Jilong and Ma, Lingxiao and Xia, Yuqing and Cui, Wei and Yang, Fan and Yang, Mao and Zhou, Lidong and Cidon, Asaf and Pekhimenko, Gennady},
  title     = {{ROLLER:} Fast and Efficient Tensor Compilation for Deep Learning},
  booktitle = {16th {USENIX} Symposium on Operating Systems Design and Implementation, {OSDI}},
  year      = {2022},
}

USENIX ATC

PilotFish: Harvesting Free Cycles of Cloud Gaming with Deep Learning Training

Wei Zhang, and 8 more authors

In 2022 USENIX Annual Technical Conference, USENIX ATC, Jul 2022

Bib HTML

@inproceedings{DBLP:conf/usenix/0149CH000SYG22,
  title = {{PilotFish}: Harvesting Free Cycles of Cloud Gaming with Deep Learning Training},
  author = {Zhang, Wei and Chen, Binghao and Han, Zhenhua and Chen, Quan and Cheng, Peng and Yang, Fan and Shu, Ran and Yang, Yuqing and Guo, Minyi},
  year = {2022},
  booktitle = {2022 {USENIX} Annual Technical Conference, {USENIX} {ATC}},
}

OSDI

SparTA: Deep-Learning Model Sparsity via Tensor-with-Sparsity-Attribute

Ningxin Zheng, and 8 more authors

In 16th USENIX Symposium on Operating Systems Design and Implementation, OSDI, Jul 2022

Bib HTML

@inproceedings{DBLP:conf/osdi/ZhengLZMY0WYZ22,
  title = {{SparTA}: Deep-Learning Model Sparsity via Tensor-with-Sparsity-Attribute},
  author = {Zheng, Ningxin and Lin, Bin and Zhang, Quanlu and Ma, Lingxiao and Yang, Yuqing and Yang, Fan and Wang, Yang and Yang, Mao and Zhou, Lidong},
  year = {2022},
  booktitle = {16th {USENIX} Symposium on Operating Systems Design and Implementation, {OSDI}},
}

SIGIR
Distill-VQ: Learning Retrieval Oriented Vector Quantization By Distilling Knowledge from Dense Embeddings

Shitao Xiao, and 10 more authors

In Proceedings of the 45th International ACM SIGIR Conference on Research and Development in Information Retrieval, Madrid, Spain, Jul 2022

Abs Bib HTML

Vector quantization (VQ) based ANN indexes, such as Inverted File System (IVF) and Product Quantization (PQ), have been widely applied to embedding based document retrieval thanks to the competitive time and memory efficiency. Originally, VQ is learned to minimize the reconstruction loss, i.e., the distortions between the original dense embeddings and the reconstructed embeddings after quantization. Unfortunately, such an objective is inconsistent with the goal of selecting ground-truth documents for the input query, which may cause severe loss of retrieval quality. Recent works identify such a defect, and propose to minimize the retrieval loss through contrastive learning. However, these methods intensively rely on queries with ground-truth documents, whose performance is limited by the insufficiency of labeled data. In this paper, we propose Distill-VQ, which unifies the learning of IVF and PQ within a knowledge distillation framework. In Distill-VQ, the dense embeddings are leveraged as “teachers”, which predict the query’s relevance to the sampled documents. The VQ modules are treated as the “students”, which are learned to reproduce the predicted relevance, such that the reconstructed embeddings may fully preserve the retrieval result of the dense embeddings. By doing so, Distill-VQ is able to derive substantial training signals from the massive unlabeled data, which significantly contributes to the retrieval quality. We perform comprehensive explorations for the optimal conduct of knowledge distillation, which may provide useful insights for the learning of VQ based ANN index. We also experimentally show that the labeled data is no longer a necessity for high-quality vector quantization, which indicates Distill-VQ’s strong applicability in practice. The evaluations are performed on MS MARCO and Natural Questions benchmarks, where Distill-VQ notably outperforms the SOTA VQ methods in Recall and MRR. Our code is available at https://github.com/staoxiao/LibVQ.
@inproceedings{10.1145/3477495.3531799,
  title = {{Distill-VQ}: Learning Retrieval Oriented Vector Quantization By Distilling Knowledge from Dense Embeddings},
  author = {Xiao, Shitao and Liu, Zheng and Han, Weihao and Zhang, Jianjin and Lian, Defu and Gong, Yeyun and Chen, Qi and Yang, Fan and Sun, Hao and Shao, Yingxia and Xie, Xing},
  year = {2022},
  booktitle = {Proceedings of the 45th International {ACM} {SIGIR} Conference on Research and Development in Information Retrieval},
  location = {Madrid, Spain},
  publisher = {Association for Computing Machinery},
  address = {New York, NY, USA},
  series = {SIGIR '22},
  pages = {1513--1523},
  doi = {10.1145/3477495.3531799},
  isbn = {9781450387323},
  numpages = {11},
  keywords = {approximate nearest neighbour search, embedding based retrieval, knowledge distillation, vector quantization},
}
ECCV
NÜWA: Visual Synthesis Pre-training for Neural visUal World creAtion

Chenfei Wu, and 6 more authors

In Computer Vision – ECCV 2022, Jul 2022

Oral Abs Bib HTML

Selected as an ECCV Oral presentation. Only 158 papers (2.7% of submitted papers) were selected for oral presentation.

This paper presents a unified multimodal pre-trained model called NÜWA that can generate new or manipulate existing visual data (i.e., images and videos) for various visual synthesis tasks. To cover language, image, and video at the same time for different scenarios, a 3D transformer encoder-decoder framework is designed, which can not only deal with videos as 3D data but also adapt to texts and images as 1D and 2D data, respectively. A 3D Nearby Attention (3DNA) mechanism is also proposed to consider the nature of the visual data and reduce the computational complexity. We evaluate NÜWA on 8 downstream tasks. Compared to several strong baselines, NÜWA achieves state-of-the-art results on text-to-image generation, text-to-video generation, video prediction, etc. Furthermore, it also shows surprisingly good zero-shot capabilities on text-guided image and video manipulation tasks.
@inproceedings{10.1007/978-3-031-19787-1_41,
  title = {{N{\"U}WA}: Visual Synthesis Pre-training for Neural {visUal} World {creAtion}},
  author = {Wu, Chenfei and Liang, Jian and Ji, Lei and Yang, Fan and Fang, Yuejian and Jiang, Daxin and Duan, Nan},
  year = {2022},
  booktitle = {Computer Vision -- {ECCV} 2022},
  publisher = {Springer Nature Switzerland},
  address = {Cham},
  pages = {720--736},
  doi = {10.1007/978-3-031-19787-1_41},
  isbn = {978-3-031-19787-1},
  editor = {Avidan, Shai and Brostow, Gabriel and Ciss{\'e}, Moustapha and Farinella, Giovanni Maria and Hassner, Tal},
}

2021

ArXiv

GODIVA: Generating Open-DomaIn Videos from nAtural Descriptions

Chenfei Wu, and 7 more authors

arXiv, Jul 2021

Bib HTML

@article{wu2021godivageneratingopendomainvideos,
  title = {{GODIVA}: Generating {Open-DomaIn} Videos from {nAtural} Descriptions},
  author = {Wu, Chenfei and Huang, Lun and Zhang, Qianxi and Li, Binyang and Ji, Lei and Yang, Fan and Sapiro, Guillermo and Duan, Nan},
  year = {2021},
  journal = {arXiv},
  eprint = {2104.14806},
  archiveprefix = {arXiv},
  primaryclass = {cs.CV},
}

2020

EMNLP
XGLUE: A New Benchmark Dataset for Cross-lingual Pre-training, Understanding and Generation

Yaobo Liang, and 23 more authors

In Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing (EMNLP), Nov 2020

Abs Bib HTML

In this paper, we introduce XGLUE, a new benchmark dataset to train large-scale cross-lingual pre-trained models using multilingual and bilingual corpora, and evaluate their performance across a diverse set of cross-lingual tasks. Compared to GLUE (Wang et al., 2019), which is labeled in English and includes natural language understanding tasks only, XGLUE has three main advantages: (1) it provides two corpora with different sizes for cross-lingual pre-training; (2) it provides 11 diversified tasks that cover both natural language understanding and generation scenarios; (3) for each task, it provides labeled data in multiple languages. We extend a recent cross-lingual pre-trained model Unicoder (Huang et al., 2019) to cover both understanding and generation tasks, which is evaluated on XGLUE as a strong baseline. We also evaluate the base versions (12-layer) of Multilingual BERT, XLM and XLM-R for comparison.
@inproceedings{liang-etal-2020-xglue,
  author    = {Liang, Yaobo and Duan, Nan and Gong, Yeyun and Wu, Ning and Guo, Fenfei and Qi, Weizhen and Gong, Ming and Shou, Linjun and Jiang, Daxin and Cao, Guihong and Fan, Xiaodong and Zhang, Ruofei and Agrawal, Rahul and Cui, Edward and Wei, Sining and Bharti, Taroon and Qiao, Ying and Chen, Jiun-Hung and Wu, Winnie and Liu, Shuguang and Yang, Fan and Campos, Daniel and Majumder, Rangan and Zhou, Ming},
  title     = {{XGLUE}: A New Benchmark Dataset for Cross-lingual Pre-training, Understanding and Generation},
  editor    = {Webber, Bonnie and Cohn, Trevor and He, Yulan and Liu, Yang},
  booktitle = {Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing (EMNLP)},
  publisher = {Association for Computational Linguistics},
  address   = {Online},
  pages     = {6008--6018},
  month     = nov,
  year      = {2020},
  doi       = {10.18653/v1/2020.emnlp-main.484},
}

ASPLOS

Capuchin: Tensor-based GPU Memory Management for Deep Learning

Xuan Peng, and 7 more authors

In ASPLOS ’20: Architectural Support for Programming Languages and Operating Systems, ASPLOS, Nov 2020

Bib HTML

@inproceedings{DBLP:conf/asplos/PengSD0MXYQ20,
  author    = {Peng, Xuan and Shi, Xuanhua and Dai, Hulin and Jin, Hai and Ma, Weiliang and Xiong, Qian and Yang, Fan and Qian, Xuehai},
  title     = {Capuchin: Tensor-based {GPU} Memory Management for Deep Learning},
  booktitle = {{ASPLOS} '20: Architectural Support for Programming Languages and Operating Systems, {ASPLOS}},
  year      = {2020},
}

OSDI

HiveD: Sharing a GPU Cluster for Deep Learning with Guarantees

Hanyu Zhao, and 10 more authors

In 14th USENIX Symposium on Operating Systems Design and Implementation, OSDI, Nov 2020

Bib HTML

@inproceedings{DBLP:conf/osdi/ZhaoHYZYZYLWXW20,
  title = {{HiveD}: Sharing a {GPU} Cluster for Deep Learning with Guarantees},
  author = {Zhao, Hanyu and Han, Zhenhua and Yang, Zhi and Zhang, Quanlu and Yang, Fan and Zhou, Lidong and Yang, Mao and Lau, Francis C. M. and Wang, Yuqi and Xiong, Yifan and Wang, Bin},
  year = {2020},
  booktitle = {14th {USENIX} Symposium on Operating Systems Design and Implementation, {OSDI}},
}

OSDI

Rammer: Enabling Holistic Deep Learning Compiler Optimizations with rTasks

Lingxiao Ma, and 9 more authors

In 14th USENIX Symposium on Operating Systems Design and Implementation, OSDI, Nov 2020

Bib HTML

@inproceedings{DBLP:conf/osdi/MaXYXMCHYZZ20,
  title = {Rammer: Enabling Holistic Deep Learning Compiler Optimizations with {rTasks}},
  author = {Ma, Lingxiao and Xie, Zhiqiang and Yang, Zhi and Xue, Jilong and Miao, Youshan and Cui, Wei and Hu, Wenxiang and Yang, Fan and Zhang, Lintao and Zhou, Lidong},
  year = {2020},
  booktitle = {14th {USENIX} Symposium on Operating Systems Design and Implementation, {OSDI}},
}

OSDI

Retiarii: A Deep Learning Exploratory-Training Framework

Quanlu Zhang, and 6 more authors

In 14th USENIX Symposium on Operating Systems Design and Implementation, OSDI, Nov 2020

Bib HTML

@inproceedings{DBLP:conf/osdi/ZhangHYZLYZ20,
  author    = {Zhang, Quanlu and Han, Zhenhua and Yang, Fan and Zhang, Yuge and Liu, Zhe and Yang, Mao and Zhou, Lidong},
  title     = {Retiarii: {A} Deep Learning Exploratory-Training Framework},
  booktitle = {14th {USENIX} Symposium on Operating Systems Design and Implementation, {OSDI}},
  year      = {2020},
}

2019

USENIX ATC

Analysis of Large-Scale Multi-Tenant GPU Clusters for DNN Training Workloads

Myeongjae Jeon, and 5 more authors

In 2019 USENIX Annual Technical Conference, USENIX ATC, Nov 2019

Bib HTML

@inproceedings{DBLP:conf/usenix/JeonVPQXY19,
  author    = {Jeon, Myeongjae and Venkataraman, Shivaram and Phanishayee, Amar and Qian, Junjie and Xiao, Wencong and Yang, Fan},
  title     = {Analysis of Large-Scale Multi-Tenant {GPU} Clusters for {DNN} Training Workloads},
  booktitle = {2019 {USENIX} Annual Technical Conference, {USENIX} {ATC}},
  year      = {2019},
}

2018

OSDI

Gandiva: Introspective Cluster Scheduling for Deep Learning

Wencong Xiao, and 11 more authors

In 13th USENIX Symposium on Operating Systems Design and Implementation, OSDI, Nov 2018

Bib HTML

@inproceedings{DBLP:conf/osdi/XiaoBRSKHPPZZYZ18,
  author    = {Xiao, Wencong and Bhardwaj, Romil and Ramjee, Ramachandran and Sivathanu, Muthian and Kwatra, Nipun and Han, Zhenhua and Patel, Pratyush and Peng, Xuan and Zhao, Hanyu and Zhang, Quanlu and Yang, Fan and Zhou, Lidong},
  title     = {Gandiva: Introspective Cluster Scheduling for Deep Learning},
  booktitle = {13th {USENIX} Symposium on Operating Systems Design and Implementation, {OSDI}},
  year      = {2018},
}

Poster

Scheduling CPU for GPU-based Deep Learning Jobs

Wencong Xiao, and 6 more authors

In Proceedings of the ACM Symposium on Cloud Computing (SoCC) Poster, Carlsbad, CA, USA, Nov 2018

Bib

@inproceedings{10.1145/3267809.3275445,
  title = {Scheduling {CPU} for {GPU}-based Deep Learning Jobs},
  author = {Xiao, Wencong and Han, Zhenhua and Zhao, Hanyu and Peng, Xuan and Zhang, Quanlu and Yang, Fan and Zhou, Lidong},
  year = {2018},
  booktitle = {Proceedings of the {ACM} Symposium on Cloud Computing ({SoCC}) Poster},
  location = {Carlsbad, CA, USA},
  publisher = {Association for Computing Machinery},
  address = {New York, NY, USA},
  series = {SoCC '18},
  pages = {503},
  doi = {10.1145/3267809.3275445},
  isbn = {9781450360111},
  url = {https://doi.org/10.1145/3267809.3275445},
  numpages = {1},
  keywords = {resource scheduling, deep learning, CPU},
}

2015

SoCC
GraM: scaling graph computation to the trillions

Ming Wu, and 8 more authors

In Proceedings of the Sixth ACM Symposium on Cloud Computing, SoCC, Kohala Coast, Hawaii, Nov 2015

Abs Bib HTML

GraM is an efficient and scalable graph engine for a large class of widely used graph algorithms. It is designed to scale up to multicores on a single server, as well as scale out to multiple servers in a cluster, offering significant, often over an order-of-magnitude, improvement over existing distributed graph engines on evaluated graph algorithms. GraM is also capable of processing graphs that are significantly larger than previously reported. In particular, using 64 servers (1,024 physical cores), it performs a PageRank iteration in 140 seconds on a synthetic graph with over one trillion edges, setting a new milestone for graph engines. GraM’s efficiency and scalability come from a judicious architectural design that exploits the benefits of multi-core and RDMA. GraM uses a simple message-passing based scaling architecture for both scaling up and scaling out to expose inherent parallelism. It further benefits from a specially designed multi-core aware RDMA-based communication stack that preserves parallelism in a balanced way and allows overlapping of communication and computation. A high degree of parallelism often comes at the cost of lower efficiency due to resource fragmentation. GraM is equipped with an adaptive mechanism that evaluates the cost and benefit of parallelism to decide the appropriate configuration. Combined, these mechanisms allow GraM to scale up and out with high efficiency.
@inproceedings{10.1145/2806777.2806849,
  title = {GraM: scaling graph computation to the trillions},
  author = {Wu, Ming and Yang, Fan and Xue, Jilong and Xiao, Wencong and Miao, Youshan and Wei, Lan and Lin, Haoxiang and Dai, Yafei and Zhou, Lidong},
  year = {2015},
  booktitle = {Proceedings of the Sixth ACM Symposium on Cloud Computing, {SoCC}},
  location = {Kohala Coast, Hawaii},
  publisher = {Association for Computing Machinery},
  address = {New York, NY, USA},
  series = {SoCC '15},
  pages = {408--421},
  doi = {10.1145/2806777.2806849},
  isbn = {9781450336512},
  numpages = {14},
  keywords = {RDMA, graph computation engine, scalability}
}
ToS
ImmortalGraph: A System for Storage and Analysis of Temporal Graphs

Youshan Miao, and 8 more authors

ACM Trans. Storage, Jul 2015

Abs Bib HTML

Temporal graphs that capture graph changes over time are attracting increasing interest from research communities, for functions such as understanding temporal characteristics of social interactions on a time-evolving social graph. ImmortalGraph is a storage and execution engine designed and optimized specifically for temporal graphs. Locality is at the center of ImmortalGraph’s design: temporal graphs are carefully laid out in both persistent storage and memory, taking into account data locality in both time and graph-structure dimensions. ImmortalGraph introduces the notion of locality-aware batch scheduling in computation, so that common “bulk” operations on temporal graphs are scheduled to maximize the benefit of in-memory data locality. The design of ImmortalGraph explores an interesting interplay among locality, parallelism, and incremental computation in supporting common mining tasks on temporal graphs. The result is a high-performance temporal-graph system that is up to 5 times more efficient than existing database solutions for graph queries. The locality optimizations in ImmortalGraph offer up to an order of magnitude speedup for temporal iterative graph mining compared to a straightforward application of existing graph engines on a series of snapshots.
@article{10.1145/2700302,
  author = {Miao, Youshan and Han, Wentao and Li, Kaiwei and Wu, Ming and Yang, Fan and Zhou, Lidong and Prabhakaran, Vijayan and Chen, Enhong and Chen, Wenguang},
  title = {ImmortalGraph: A System for Storage and Analysis of Temporal Graphs},
  journal = {ACM Trans. Storage},
  year = {2015},
  month = jul,
  volume = {11},
  number = {3},
  publisher = {Association for Computing Machinery},
  address = {New York, NY, USA},
  doi = {10.1145/2700302},
  issn = {1553-3077},
  issue_date = {July 2015},
  articleno = {14},
  numpages = {34},
  keywords = {temporal graph, graph algorithms, Concurrent computing},
}

2014

EuroSys
Chronos: a graph engine for temporal graph analysis

Wentao Han, and 8 more authors

In Proceedings of the Ninth European Conference on Computer Systems, Amsterdam, The Netherlands, Jul 2014

Abs Bib HTML

Temporal graphs capture changes in graphs over time and are becoming a subject that attracts increasing interest from the research communities, for example, to understand temporal characteristics of social interactions on a time-evolving social graph. Chronos is a storage and execution engine designed and optimized specifically for running in-memory iterative graph computation on temporal graphs. Locality is at the center of the Chronos design, where the in-memory layout of temporal graphs and the scheduling of the iterative computation on temporal graphs are carefully designed, so that common "bulk" operations on temporal graphs are scheduled to maximize the benefit of in-memory data locality. The design of Chronos further explores the interesting interplay among locality, parallelism, and incremental computation in supporting common mining tasks on temporal graphs. The result is a high-performance temporal-graph system that offers up to an order of magnitude speedup for temporal iterative graph mining compared to a straightforward application of existing graph engines on a series of snapshots.
@inproceedings{10.1145/2592798.2592799,
  author = {Han, Wentao and Miao, Youshan and Li, Kaiwei and Wu, Ming and Yang, Fan and Zhou, Lidong and Prabhakaran, Vijayan and Chen, Wenguang and Chen, Enhong},
  title = {Chronos: a graph engine for temporal graph analysis},
  booktitle = {Proceedings of the Ninth European Conference on Computer Systems},
  year = {2014},
  location = {Amsterdam, The Netherlands},
  publisher = {Association for Computing Machinery},
  address = {New York, NY, USA},
  series = {EuroSys '14},
  doi = {10.1145/2592798.2592799},
  isbn = {9781450327046},
  articleno = {1},
  numpages = {14},
}
TechReport
Arming Cloud Services with Task Aspects

Zhenyu Guo, and 7 more authors

Nov 2014

Abs Bib HTML

Our cloud services are losing too many battles to faults like software bugs, resource interference, and hardware failures. Many tools can help us win these battles: model checkers to verify, fault injection to find bugs, replay to debug, and many more. Unfortunately, tools are currently afterthoughts in cloud service designs that must either be tediously tangled into service implementations or integrated transparently in ways that fail to effectively capture the service’s problematic non-deterministic (concurrent, asynchronous, and resource access) behavior. This paper makes tooling a first-class concern by having services encoded with tasks whose interactions reliably capture all non-deterministic behavior needed by tools. Task interactions are then exposed in aspects that are useful in encoding cross-cutting behavior; combined, tools encoded as task aspects can integrate with services effectively and transparently. We show how task aspects can be used to ease the development of an online production data service that runs on a hundred machines.
@techreport{guo2014arming,
  author = {Guo, Zhenyu and Chen, Cheng and Lin, Haoxiang and McDirmid, Sean and Yang, Fan and Guo, Xueying and Yang, Mao and Zhou, Lidong},
  title = {Arming Cloud Services with Task Aspects},
  institution = {Microsoft},
  number = {MSR-TR-2014-150},
  year = {2014},
  month = nov,
}

2012

EuroSys
Kineograph: taking the pulse of a fast-changing and connected world

Raymond Cheng, and 9 more authors

In Proceedings of the 7th ACM European Conference on Computer Systems, Bern, Switzerland, Nov 2012

Abs Bib HTML

Kineograph is a distributed system that takes a stream of incoming data to construct a continuously changing graph, which captures the relationships that exist in the data feed. As a computing platform, Kineograph further supports graph-mining algorithms to extract timely insights from the fast-changing graph structure. To accommodate graph-mining algorithms that assume a static underlying graph, Kineograph creates a series of consistent snapshots, using a novel and efficient epoch commit protocol. To keep up with continuous updates on the graph, Kineograph includes an incremental graph-computation engine. We have developed three applications on top of Kineograph to analyze Twitter data: user ranking, approximate shortest paths, and controversial topic detection. For these applications, Kineograph takes a live Twitter data feed and maintains a graph of edges between all users and hashtags. Our evaluation shows that with 40 machines processing 100K tweets per second, Kineograph is able to continuously compute global properties, such as user ranks, with less than 2.5-minute timeliness guarantees. This rate of traffic is more than 10 times the reported peak rate of Twitter as of October 2011.
@inproceedings{10.1145/2168836.2168846,
  title = {Kineograph: taking the pulse of a fast-changing and connected world},
  author = {Cheng, Raymond and Hong, Ji and Kyrola, Aapo and Miao, Youshan and Weng, Xuetian and Wu, Ming and Yang, Fan and Zhou, Lidong and Zhao, Feng and Chen, Enhong},
  year = {2012},
  booktitle = {Proceedings of the 7th ACM European Conference on Computer Systems},
  location = {Bern, Switzerland},
  publisher = {Association for Computing Machinery},
  address = {New York, NY, USA},
  series = {EuroSys '12},
  pages = {85--98},
  doi = {10.1145/2168836.2168846},
  isbn = {9781450312233},
  numpages = {14},
  keywords = {distributed storage, graph processing}
}
TechReport
Sonora: A Platform for Continuous Mobile-Cloud Computing

Xiuwei Chen, and 7 more authors

Mar 2012

Abs Bib HTML

This paper presents Sonora, a platform for mobile-cloud computing. Sonora is designed to support the development and execution of continuous mobile-cloud services. To this end, Sonora provides developers with stream-based programming interfaces that coherently integrate a broad range of existing techniques from mobile, database, and distributed systems. These range from support for disconnected operation to relational and event-driven models. Sonora’s execution engine is a fault-tolerant distributed runtime that supports user-facing continuous sensing and processing services in the cloud. Key features of this engine are its dynamic load balancing mechanisms, and a novel failure recovery protocol that performs checkpoint-based partial rollback recovery with selective re-execution. To illustrate the relevance and power of the stream abstraction in describing complex mobile-cloud services we evaluate Sonora’s design in the context of two services. We also validate Sonora’s design, demonstrating that Sonora is efficient, scalable, and provides responsive fault tolerance.
@techreport{chen2012sonora,
  title = {Sonora: A Platform for Continuous Mobile-Cloud Computing},
  author = {Chen, Xiuwei and Beschastnikh, Ivan and Zhuang, Li and Yang, Fan and Qian, Zhengping and Zhou, Lidong and Shen, Guobin and Shen, Jacky},
  year = {2012},
  month = mar,
  number = {MSR-TR-2012-34},
  institution = {Microsoft},
}

2007

VTC

Distributed Cooperative Rate Adaptation for Energy Efficiency in IEEE 802.11-Based Multihop Networks

Kun Wang, and 4 more authors

IEEE Transactions on Vehicular Technology, Mar 2007

Bib HTML

@article{4138031,
  author = {Wang, Kun and Yang, Fan and Zhang, Qian and Wu, Dapeng Oliver and Xu, Yinlong},
  title = {Distributed Cooperative Rate Adaptation for Energy Efficiency in IEEE 802.11-Based Multihop Networks},
  journal = {IEEE Transactions on Vehicular Technology},
  year = {2007},
  volume = {56},
  number = {2},
  pages = {888--898},
  doi = {10.1109/TVT.2007.891422},
  keywords = {Energy efficiency;Spread spectrum communication;Bit error rate;Power control;Physical layer;Throughput;Wireless LAN;Telecommunication traffic;NP-hard problem;Energy consumption;Cooperation;energy efficiency;IEEE 802.11;rate adaptation;wireless multihop network},
}

IEEENetwork

Cooperative and opportunistic transmission for wireless ad hoc networks

Qian Zhang, and 4 more authors

IEEE Network, Mar 2007

Bib HTML

@article{4107915,
  author = {Zhang, Qian and Chen, Qing and Yang, Fan and Shen, Xuemin and Niu, Zhisheng},
  title = {Cooperative and opportunistic transmission for wireless ad hoc networks},
  journal = {IEEE Network},
  year = {2007},
  volume = {21},
  number = {1},
  pages = {14--20},
  doi = {10.1109/MNET.2007.314533},
  keywords = {Mobile ad hoc networks;Ad hoc networks;Usability;Fading;Interchannel interference;Centralized control;Energy consumption;Time-varying channels;System performance;Throughput},
}

TWC

Modeling path capacity in multi-hop IEEE 802.11 networks for QoS services

Kun Wang, and 3 more authors

IEEE Transactions on Wireless Communications, Mar 2007

Bib HTML

@article{4100182,
  author = {Wang, Kun and Yang, Fan and Zhang, Qian and Xu, Yinlong},
  title = {Modeling path capacity in multi-hop IEEE 802.11 networks for QoS services},
  journal = {IEEE Transactions on Wireless Communications},
  year = {2007},
  volume = {6},
  number = {2},
  pages = {738--749},
  doi = {10.1109/TWC.2007.05434},
  keywords = {Spread spectrum communication;Telecommunication traffic;Traffic control;Interference;Communication system traffic control;Streaming media;Analytical models;Computer science;Bandwidth;Wireless mesh networks},
}

2006

QShine
Distributed cooperative rate adaptation for energy efficiency in IEEE 802.11-based multi-hop networks

Kun Wang, and 4 more authors

In Proceedings of the 3rd International Conference on Quality of Service in Heterogeneous Wired/Wireless Networks, QShine, Waterloo, Ontario, Canada, Mar 2006

Best paper Abs Bib HTML

Best paper award

In this paper we study the problem of using the rate adaptation technique to achieve energy efficiency in an IEEE 802.11-based multi-hop network. Specifically, we formulate it as an optimization problem, i.e., minimizing the total transmission power over transmission data rates, subject to the traffic requirements of all the nodes in a multi-hop network. Interestingly, we can show that this problem is actually a well-known multiple-choice knapsack problem, which is proven to be an NP-hard problem. So, instead of finding an optimal solution, which is NP-hard, we seek a sub-optimal solution. Our key technique to attack this problem is distributed cooperative rate adaptation. Here, we promote node cooperation due to our observation that the inequality in non-cooperative channel contention among nodes caused by hidden terminal phenomenon in a multi-hop network tends to result in energy inefficiency. Under this design philosophy, we propose a distributed cooperative rate adaptation (CRA) scheme and prove that it converges. Simulation results show that our CRA scheme can reduce the power consumption up to 86% as compared to the existing (non-cooperative) algorithm.
@inproceedings{10.1145/1185373.1185375,
  title = {Distributed cooperative rate adaptation for energy efficiency in IEEE 802.11-based multi-hop networks},
  author = {Wang, Kun and Yang, Fan and Zhang, Qian and Wu, Dapeng Oliver and Xu, Yinlong},
  year = {2006},
  booktitle = {Proceedings of the 3rd International Conference on Quality of Service in Heterogeneous Wired/Wireless Networks, {QShine}},
  location = {Waterloo, Ontario, Canada},
  publisher = {Association for Computing Machinery},
  address = {New York, NY, USA},
  series = {QShine '06},
  pages = {1--es},
  doi = {10.1145/1185373.1185375},
  isbn = {1595935371},
}

Globecom

On Improving the Throughput of Media Delivery Applications in Heterogeneous Overlay Network

Jin Zhao, and 3 more authors

In IEEE Globecom 2006, Mar 2006

Bib HTML

@inproceedings{4150846,
  title = {On Improving the Throughput of Media Delivery Applications in Heterogeneous Overlay Network},
  author = {Zhao, Jin and Yang, Fan and Zhang, Qian and Zhang, Zhensheng},
  year = {2006},
  booktitle = {IEEE Globecom 2006},
  pages = {1--6},
  doi = {10.1109/GLOCOM.2006.216},
  keywords = {Throughput;Nonhomogeneous media;Network coding;Peer to peer computing;Bandwidth;Internet;Organizing;Mathematical programming;Asia;Heuristic algorithms},
}

JSAC

Distributed Channel Assignment and Routing in Multiradio Multichannel Multihop Wireless Networks

H. Wu, and 5 more authors

IEEE Journal on Selected Areas in Communications, Mar 2006

Bib HTML

@article{1717612,
  author = {Wu, H. and Yang, F. and Tan, K. and Chen, J. and Zhang, Q. and Zhang, Z.},
  title = {Distributed Channel Assignment and Routing in Multiradio Multichannel Multihop Wireless Networks},
  journal = {IEEE Journal on Selected Areas in Communications},
  year = {2006},
  volume = {24},
  number = {11},
  pages = {1972--1983},
  doi = {10.1109/JSAC.2006.881638},
  keywords = {Spread spectrum communication;Wireless networks;Interference;Costs;Routing protocols;Wireless application protocol;Hardware;Network interfaces;Telecommunication traffic;Coordinate measuring machines;Channel assignment;multihop;multiradio;routing;wireless network},
}

ChinaCom

Next generation mobile multimedia communications: Media codec and media transport perspectives

Feng Wu, and 4 more authors

China Communications, Mar 2006

Bib HTML

@article{wu2006next,
  author = {Wu, Feng and Shen, Guobin and Tan, Kun and Yang, Fan and Li, Shipeng},
  title = {Next generation mobile multimedia communications: Media codec and media transport perspectives},
  journal = {China Communications},
  year = {2006},
  volume = {3},
  pages = {30--44},
  publisher = {Citeseer}
}

ToM

LION: Layered Overlay Multicast With Network Coding

J. Zhao, and 4 more authors

IEEE Transactions on Multimedia, Mar 2006

Bib HTML

@article{1703516,
  author = {Zhao, J. and Yang, F. and Zhang, Q. and Zhang, Z. and Zhang, F.},
  title = {LION: Layered Overlay Multicast With Network Coding},
  journal = {IEEE Transactions on Multimedia},
  year = {2006},
  volume = {8},
  number = {5},
  pages = {1021--1032},
  doi = {10.1109/TMM.2006.879847},
  keywords = {Network coding;Throughput;Bandwidth;Organizing;Computer science;Information theory;Mathematical programming;Heuristic algorithms;Network topology;Peer to peer computing;Heterogeneity;network coding;overlay multicast},
}

ICC

Impact of Power and Rate Selection on the Throughput of Ad Hoc Networks

Cong Peng, and 5 more authors

In 2006 IEEE International Conference on Communications, Mar 2006

Bib HTML

@inproceedings{4025092,
  title = {Impact of Power and Rate Selection on the Throughput of Ad Hoc Networks},
  author = {Peng, Cong and Yang, Fan and Zhang, Qian and Wu, Dapeng and Zhao, Ming and Yao, Yan},
  year = {2006},
  booktitle = {2006 IEEE International Conference on Communications},
  volume = {9},
  pages = {3897--3902},
  doi = {10.1109/ICC.2006.255690},
  keywords = {Throughput;Ad hoc networks;Physical layer;Iterative algorithms;Wireless networks;Mobile ad hoc networks;Computer networks;Computational modeling;Power control;Communication system traffic control;Wireless ad hoc networks;capacity;transmission power control;rate adaptation}
}

2005

EURASIP

Cross-layer QoS support for multimedia delivery over wireless Internet

Qian Zhang, and 2 more authors

EURASIP Journal on Advances in Signal Processing, Mar 2005

Bib HTML

@article{zhang2005cross,
  author = {Zhang, Qian and Yang, Fan and Zhu, Wenwu},
  title = {Cross-layer QoS support for multimedia delivery over wireless Internet},
  journal = {EURASIP Journal on Advances in Signal Processing},
  year = {2005},
  volume = {2005},
  pages = {1--13},
  publisher = {Springer}
}

ICC

AMTP: a multipath multimedia streaming protocol for mobile ad hoc networks

K. Rojviboonchai, and 4 more authors

In IEEE International Conference on Communications, 2005. ICC 2005. 2005, Mar 2005

Bib HTML

@inproceedings{1494546,
  title = {AMTP: a multipath multimedia streaming protocol for mobile ad hoc networks},
  author = {Rojviboonchai, K. and Yang, Fan and Zhang, Qian and Aida, H. and Zhu, Wenwu},
  year = {2005},
  booktitle = {IEEE International Conference on Communications, 2005. ICC 2005. 2005},
  volume = {2},
  pages = {1246--1250},
  doi = {10.1109/ICC.2005.1494546},
  keywords = {Streaming media;Mobile ad hoc networks;Ad hoc networks;Transport protocols;Aggregates;Throughput;Switches;Spread spectrum communication;Wireless networks;Network topology}
}

2004

JSAC

End-to-end TCP-friendly streaming protocol and bit allocation for scalable video over wireless Internet

Fan Yang, and 3 more authors

IEEE Journal on Selected Areas in Communications, Mar 2004

Bib HTML

@article{1295064,
  author = {Yang, Fan and Zhang, Qian and Zhu, Wenwu and Zhang, Ya-Qin},
  title = {End-to-end TCP-friendly streaming protocol and bit allocation for scalable video over wireless Internet},
  journal = {IEEE Journal on Selected Areas in Communications},
  year = {2004},
  volume = {22},
  number = {4},
  pages = {777--790},
  doi = {10.1109/JSAC.2004.826008},
  keywords = {Streaming media;Wireless application protocol;Bit rate;Internet;IP networks;Wireless networks;Convergence;Resource management;Information filtering;Information filters},
}

INFOCOM

Bit allocation for scalable video streaming over mobile wireless Internet

Fan Yang, and 3 more authors

In IEEE INFOCOM 2004, Mar 2004

Bib HTML

@inproceedings{1354621,
  title = {Bit allocation for scalable video streaming over mobile wireless Internet},
  author = {Yang, Fan and Zhang, Qian and Zhu, Wenwu and Zhang, Ya-Qin},
  year = {2004},
  booktitle = {IEEE INFOCOM 2004},
  volume = {3},
  pages = {2142--2151},
  doi = {10.1109/INFCOM.2004.1354621},
  url = {https://ieeexplore.ieee.org/abstract/document/1354621},
  keywords = {Bit rate;Streaming media;Internet;IP networks;Wireless networks;Wireless application protocol;Propagation losses;Convergence;Resource management;Smoothing methods}
}

2003

ICME

An end-to-end TCP-friendly streaming protocol for multimedia over wireless Internet

Fan Yang, and 3 more authors

In 2003 International Conference on Multimedia and Expo. ICME ’03. Proceedings (Cat. No.03TH8698), Mar 2003

Bib HTML

@inproceedings{1221645,
  title = {An end-to-end TCP-friendly streaming protocol for multimedia over wireless Internet},
  author = {Yang, Fan and Zhang, Qian and Zhu, Wenwu and Zhang, Ya-Qin},
  year = {2003},
  booktitle = {2003 International Conference on Multimedia and Expo. ICME '03. Proceedings (Cat. No.03TH8698)},
  volume = {2},
  pages = {II-429},
  doi = {10.1109/ICME.2003.1221645},
  keywords = {Streaming media;Wireless application protocol;Internet;Convergence;IP networks;Wireless networks;Information filtering;Information filters;Performance analysis;Analytical models}
}

2001

3GWireless

An efficient transport scheme for multimedia over wireless internet

Fan Yang, and 3 more authors

In Proceedings of 2001 IEEE International Conference on 3G Wireless and Beyond, Mar 2001

Bib HTML

@inproceedings{yang2001efficient,
  author = {Yang, Fan and Zhang, Qian and Zhu, Wenwu and Zhang, Ya-Qin},
  title = {An efficient transport scheme for multimedia over wireless internet},
  booktitle = {Proceedings of 2001 IEEE International Conference on 3G Wireless and Beyond},
  year = {2001},
  pages = {651}
}