I am a research scientist and team lead at the Beijing Institute for General Artificial Intelligence (BIGAI), China. I received my Ph.D. in 2022 from the University of California, Los Angeles (UCLA), where I was advised by Professor Song-Chun Zhu. During my Ph.D., I interned at Google Research, Microsoft Azure AI, and Amazon Alexa. Before UCLA, I obtained my Bachelor's degree in 2015 and my Master's degree in 2018 from the University of Science and Technology of China (USTC).
Currently, I focus on developing language agents for general computer use via agentic reinforcement learning (RL).
@article{li2025dart,
  title={Efficient Multi-turn RL for GUI Agents via Decoupled Training and Adaptive Data Curation},
  author={Li, Pengxiang and Hu, Zechen and Shang, Zirui and Wu, Jingrong and Liu, Yang and Liu, Hui and Gao, Zhi and Shi, Chenrui and Zhang, Bofei and Zhang, Zihao and Shi, Xiaochuan and Yu, Zedong and Wu, Yuwei and Wu, Xinxiao and Jia, Yunde and Xiang, Liuyu and He, Zhaofeng and Li, Qing},
  journal={arXiv preprint arXiv:2509.23866},
  year={2025}
}
TongUI: Building Generalized GUI Agents by Learning from Multimodal Web Tutorials
Bofei Zhang, Zirui Shang, Zhi Gao, Wang Zhang, Rui Xie, Xiaojian Ma, Tao Yuan, Xinxiao Wu, Song-Chun Zhu, and Qing Li✉
@article{zhang2025tongui,
  title={TongUI: Building Generalized GUI Agents by Learning from Multimodal Web Tutorials},
  author={Zhang, Bofei and Shang, Zirui and Gao, Zhi and Zhang, Wang and Xie, Rui and Ma, Xiaojian and Yuan, Tao and Wu, Xinxiao and Zhu, Song-Chun and Li, Qing},
  journal={arXiv preprint arXiv:2504.12679},
  year={2025}
}
Iterative Tool Usage Exploration for Multimodal Agents via Step-wise Preference Tuning
@article{li2025sport,
  title={Iterative Tool Usage Exploration for Multimodal Agents via Step-wise Preference Tuning},
  author={Li, Pengxiang and Gao, Zhi and Zhang, Bofei and Mi, Yapeng and Ma, Xiaojian and Shi, Chenrui and Yuan, Tao and Wu, Yuwei and Jia, Yunde and Zhu, Song-Chun and Li, Qing},
  journal={Neural Information Processing Systems (NeurIPS)},
  year={2025}
}
From Objects to Anywhere: A Holistic Benchmark for Multi-level Visual Grounding in 3D Scenes
@article{wang2025anywhere3d,
  title={From Objects to Anywhere: A Holistic Benchmark for Multi-level Visual Grounding in 3D Scenes},
  author={Wang, Tianxu and Zhang, Zhuofan and Zhu, Ziyu and Fan, Yue and Xiong, Jing and Li, Pengxiang and Ma, Xiaojian and Li, Qing},
  journal={Neural Information Processing Systems: Datasets and Benchmarks (NeurIPS D\&B)},
  year={2025}
}
Move to Understand a 3D Scene: Bridging Visual Grounding and Exploration for Efficient and Versatile Embodied Navigation Highlight
@article{zhu2025mtu,
  title={Move to Understand a 3D Scene: Bridging Visual Grounding and Exploration for Efficient and Versatile Embodied Navigation},
  author={Zhu, Ziyu and Wang, Xilin and Li, Yixuan and Zhang, Zhuofan and Ma, Xiaojian and Chen, Yixin and Jia, Baoxiong and Liang, Wei and Yu, Qian and Deng, Zhidong and Huang, Siyuan and Li, Qing},
  journal={International Conference on Computer Vision (ICCV)},
  year={2025}
}
Embodied VideoAgent: Persistent Memory from Egocentric Videos and Embodied Sensors Enables Dynamic Scene Understanding Highlight
Yue Fan, Xiaojian Ma, Rongpeng Su, Jun Guo, Rujie Wu, Xi Chen, and Qing Li✉
International Conference on Computer Vision (ICCV), 2025
@article{fan2025eva,
  title={Embodied VideoAgent: Persistent Memory from Egocentric Videos and Embodied Sensors Enables Dynamic Scene Understanding},
  author={Fan, Yue and Ma, Xiaojian and Su, Rongpeng and Guo, Jun and Wu, Rujie and Chen, Xi and Li, Qing},
  journal={International Conference on Computer Vision (ICCV)},
  year={2025}
}
Falcon: Fast Visuomotor Policies via Partial Denoising
Haojun Chen, Minghao Liu, Chengdong Ma, Xiaojian Ma, Zailin Ma, Huimin Wu, Yuanpei Chen, Yifan Zhong, Mingzhi Wang, Qing Li✉, and Yaodong Yang✉
International Conference on Machine Learning (ICML), 2025
@article{chen2025falcon,
  title={Falcon: Fast Visuomotor Policies via Partial Denoising},
  author={Chen, Haojun and Liu, Minghao and Ma, Chengdong and Ma, Xiaojian and Ma, Zailin and Wu, Huimin and Chen, Yuanpei and Zhong, Yifan and Wang, Mingzhi and Li, Qing and Yang, Yaodong},
  journal={International Conference on Machine Learning (ICML)},
  year={2025}
}
MetaScenes: Towards Automated Replica Creation for Real-world 3D Scans
@article{yu2025metascenes,
  title={MetaScenes: Towards Automated Replica Creation for Real-world 3D Scans},
  author={Yu, Huangyue and Jia, Baoxiong and Chen, Yixin and Yang, Yandan and Su, Rongpeng and Li, Jiaxin and Li, Qing and Liang, Wei and Zhu, Song-Chun and Liu, Tengyu and Huang, Siyuan},
  journal={The IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)},
  year={2025}
}
Unveiling the Mist over 3D Vision-Language Understanding: Object-centric Evaluation with Chain-of-Analysis
@article{huang2025beacon3d,
  title={Unveiling the Mist over 3D Vision-Language Understanding: Object-centric Evaluation with Chain-of-Analysis},
  author={Huang, Jiangyong and Jia, Baoxiong and Zhu, Ziyu and Wang, Yan and Linghu, Xiongkun and Li, Qing and Zhu, Song-Chun and Huang, Siyuan},
  journal={The IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)},
  year={2025}
}
Multi-modal Agent Tuning: Building a VLM-Driven Agent for Efficient Tool Usage Spotlight
@article{2025mat,
  title={Multi-modal Agent Tuning: Building a VLM-Driven Agent for Efficient Tool Usage},
  author={Gao, Zhi and Zhang, Bofei and Li, Pengxiang and Ma, Xiaojian and Yuan, Tao and Fan, Yue and Wu, Yuwei and Jia, Yunde and Zhu, Song-Chun and Li, Qing},
  journal={International Conference on Learning Representations (ICLR)},
  year={2025}
}
MMKE-Bench: A Multimodal Editing Benchmark for Diverse Visual Knowledge
@article{2025mmke,
  title={MMKE-Bench: A Multimodal Editing Benchmark for Diverse Visual Knowledge},
  author={Du, Yuntao and Jiang, Kailin and Gao, Zhi and Shi, Chenrui and Zheng, Zilong and Qi, Siyuan and Li, Qing},
  journal={International Conference on Learning Representations (ICLR)},
  year={2025}
}
FIRE: A Dataset for Feedback Integration and Refinement Evaluation of Multimodal Models
@article{2024fire,
  title={FIRE: A Dataset for Feedback Integration and Refinement Evaluation of Multimodal Models},
  author={Li, Pengxiang and Gao, Zhi and Zhang, Bofei and Yuan, Tao and Wu, Yuwei and Harandi, Mehrtash and Jia, Yunde and Zhu, Song-Chun and Li, Qing},
  journal={Neural Information Processing Systems: Datasets and Benchmarks (NeurIPS D\&B)},
  year={2024}
}
UltraEdit: Instruction-based Fine-Grained Image Editing at Scale
Haozhe Zhao*, Xiaojian Ma*, Liang Chen, Shuzheng Si, Rujie Wu, Kaikai An, Peiyu Yu, Minjia Zhang, Qing Li✉, and Baobao Chang✉
Neural Information Processing Systems: Datasets and Benchmarks (NeurIPS D&B), 2024
@article{2024ultraedit,
  title={UltraEdit: Instruction-based Fine-Grained Image Editing at Scale},
  author={Zhao, Haozhe and Ma, Xiaojian and Chen, Liang and Si, Shuzheng and Wu, Rujie and An, Kaikai and Yu, Peiyu and Zhang, Minjia and Li, Qing and Chang, Baobao},
  journal={Neural Information Processing Systems: Datasets and Benchmarks (NeurIPS D\&B)},
  year={2024}
}
Task-oriented Sequential Grounding in 3D Scenes
@article{2024sg3d,
  title={Task-oriented Sequential Grounding in 3D Scenes},
  author={Zhang, Zhuofan and Zhu, Ziyu and Li, Pengxiang and Liu, Tengyu and Ma, Xiaojian and Chen, Yixin and Jia, Baoxiong and Huang, Siyuan and Li, Qing},
  journal={arXiv preprint arXiv:2408.04034},
  year={2024}
}
End-to-End Neuro-Symbolic Reinforcement Learning with Textual Explanations Spotlight (top 3.5%)
@article{luo2024insight,
  title={End-to-End Neuro-Symbolic Reinforcement Learning with Textual Explanations},
  author={Luo, Lirui and Zhang, Guoxi and Xu, Hongming and Yang, Yaodong and Fang, Cong and Li, Qing},
  journal={International Conference on Machine Learning (ICML)},
  year={2024}
}
Unifying 3D Vision-Language Understanding Via Promptable Queries
@article{zhu2024unifying,
  title={Unifying 3D Vision-Language Understanding Via Promptable Queries},
  author={Zhu, Ziyu and Zhang, Zhuofan and Ma, Xiaojian and Niu, Xuesong and Chen, Yixin and Jia, Baoxiong and Deng, Zhidong and Huang, Siyuan and Li, Qing},
  journal={European Conference on Computer Vision (ECCV)},
  year={2024}
}
VideoAgent: A Memory-augmented Multimodal Agent for Video Understanding
@article{fan2024videoagent,
  title={VideoAgent: A Memory-augmented Multimodal Agent for Video Understanding},
  author={Fan, Yue and Ma, Xiaojian and Wu, Rujie and Du, Yuntao and Li, Jiaqi and Gao, Zhi and Li, Qing},
  journal={European Conference on Computer Vision (ECCV)},
  year={2024}
}
SceneVerse: Scaling 3D Vision-Language Learning for Grounded Scene Understanding
@article{jia2024sceneverse,
  title={SceneVerse: Scaling 3D Vision-Language Learning for Grounded Scene Understanding},
  author={Jia, Baoxiong and Chen, Yixin and Yu, Huangyue and Wang, Yan and Niu, Xuesong and Liu, Tengyu and Li, Qing and Huang, Siyuan},
  journal={European Conference on Computer Vision (ECCV)},
  year={2024}
}
CLOVA: A Closed-Loop Visual Assistant with Tool Usage and Update
@article{gao2024clova,
  title={CLOVA: A Closed-Loop Visual Assistant with Tool Usage and Update},
  author={Gao, Zhi and Du, Yuntao and Zhang, Xintong and Ma, Xiaojian and Han, Wenjuan and Zhu, Song-Chun and Li, Qing},
  journal={The IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)},
  year={2024}
}
An Embodied Generalist Agent in 3D World
@article{huang2024embodied,
  title={An Embodied Generalist Agent in 3D World},
  author={Huang, Jiangyong and Yong, Silong and Ma, Xiaojian and Linghu, Xiongkun and Li, Puhao and Wang, Yan and Li, Qing and Zhu, Song-Chun and Jia, Baoxiong and Huang, Siyuan},
  journal={International Conference on Machine Learning (ICML)},
  year={2024}
}
Neural-Symbolic Recursive Machine for Systematic Generalization
@article{li2024nsr,
  title={Neural-Symbolic Recursive Machine for Systematic Generalization},
  author={Li, Qing and Zhu, Yixin and Liang, Yitao and Wu, Ying Nian and Zhu, Song-Chun and Huang, Siyuan},
  journal={International Conference on Learning Representations (ICLR)},
  year={2024}
}
Bongard-OpenWorld: Few-Shot Reasoning for Free-Form Visual Concepts in the Real World
@article{wu2024bongard,
  title={Bongard-OpenWorld: Few-Shot Reasoning for Free-Form Visual Concepts in the Real World},
  author={Wu, Rujie and Ma, Xiaojian and Zhang, Zhenliang and Wang, Wei and Li, Qing and Zhu, Song-Chun and Wang, Yizhou},
  journal={International Conference on Learning Representations (ICLR)},
  year={2024}
}
Learning Non-Markovian Decision-Making from State-Only Sequences
@article{qin2023learning,
  title={Learning Non-Markovian Decision-Making from State-Only Sequences},
  author={Qin, Aoyang and Gao, Feng and Li, Qing and Zhu, Song-Chun and Xie, Sirui},
  journal={Neural Information Processing Systems (NeurIPS)},
  year={2023}
}
A Minimalist Dataset for Systematic Generalization of Perception, Syntax, and Semantics Notable-top-25%
@article{li2023hint,
  title={A Minimalist Dataset for Systematic Generalization of Perception, Syntax, and Semantics},
  author={Li, Qing and Huang, Siyuan and Hong, Yining and Zhu, Yixin and Wu, Ying Nian and Zhu, Song-Chun},
  journal={International Conference on Learning Representations (ICLR)},
  year={2023}
}
3D-VisTA: Pre-Trained Transformer for 3D Vision and Text Alignment
@article{zhu2023vista,
  title={3D-VisTA: Pre-Trained Transformer for 3D Vision and Text Alignment},
  author={Zhu, Ziyu and Ma, Xiaojian and Chen, Yixin and Deng, Zhidong and Huang, Siyuan and Li, Qing},
  journal={International Conference on Computer Vision (ICCV)},
  year={2023}
}
SQA3D: Situated Question Answering in 3D Scenes
@article{ma2023sqa3d,
  title={SQA3D: Situated Question Answering in 3D Scenes},
  author={Ma, Xiaojian and Yong, Silong and Zheng, Zilong and Li, Qing and Liang, Yitao and Zhu, Song-Chun and Huang, Siyuan},
  journal={International Conference on Learning Representations (ICLR)},
  year={2023}
}
SMART: A Situation Model for Algebra Story Problems via Attributed Grammar
@article{hong2021smart,
  title={SMART: A Situation Model for Algebra Story Problems via Attributed Grammar},
  author={Hong, Yining and Li, Qing and Gong, Ran and Ciao, Daniel and Huang, Siyuan and Zhu, Song-Chun},
  journal={AAAI Conference on Artificial Intelligence (AAAI)},
  year={2021}
}
Learning by Fixing: Solving Math Word Problems with Weak Supervision
@article{hong2021learning,
  title={Learning by Fixing: Solving Math Word Problems with Weak Supervision},
  author={Hong, Yining and Li, Qing and Ciao, Daniel and Huang, Siyuan and Zhu, Song-Chun},
  journal={AAAI Conference on Artificial Intelligence (AAAI)},
  year={2021}
}
YouRefIt: Embodied Reference Understanding with Language and Gesture Oral
@article{chen2021yourefit,
  title={YouRefIt: Embodied Reference Understanding with Language and Gesture},
  author={Chen, Yixin and Li, Qing and Kong, Deqian and Kei, Yik Lun and Zhu, Song-Chun and Gao, Tao and Zhu, Yixin and Huang, Siyuan},
  journal={International Conference on Computer Vision (ICCV)},
  year={2021}
}
VLGrammar: Grounded Grammar Induction of Vision and Language
@article{hong2021vlgrammar,
  title={VLGrammar: Grounded Grammar Induction of Vision and Language},
  author={Hong, Yining and Li, Qing and Zhu, Song-Chun and Huang, Siyuan},
  journal={International Conference on Computer Vision (ICCV)},
  year={2021}
}
A Competence-Aware Curriculum for Visual Concepts Learning Via Question Answering Oral
@article{li2020competence,
  title={A Competence-Aware Curriculum for Visual Concepts Learning Via Question Answering},
  author={Li, Qing and Huang, Siyuan and Hong, Yining and Zhu, Song-Chun},
  journal={European Conference on Computer Vision (ECCV)},
  year={2020}
}
Closed Loop Neural-Symbolic Learning Via Integrating Neural Perception, Grammar Parsing, and Symbolic Reasoning Best Paper in ICML Workshop
@article{li2020ngs,
  title={Closed Loop Neural-Symbolic Learning Via Integrating Neural Perception, Grammar Parsing, and Symbolic Reasoning},
  author={Li, Qing and Huang, Siyuan and Hong, Yining and Chen, Yixin and Wu, Ying Nian and Zhu, Song-Chun},
  journal={International Conference on Machine Learning (ICML)},
  year={2020}
}
Why Does a Visual Question Have Different Answers?
@article{bhattacharya2019visual,
  title={Why Does a Visual Question Have Different Answers?},
  author={Bhattacharya, Nilavra and Li, Qing and Gurari, Danna},
  journal={International Conference on Computer Vision (ICCV)},
  year={2019}
}
VizWiz-Priv: A Dataset for Recognizing the Presence and Purpose of Private Visual Information in Images Taken by Blind People
Danna Gurari, Qing Li, Chi Lin, Yinan Zhao, Anhong Guo, Abigale Stangl, and Jeffrey P Bigham
The IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR), 2019
@article{gurari2019vizwizpriv,
  title={VizWiz-Priv: A Dataset for Recognizing the Presence and Purpose of Private Visual Information in Images Taken by Blind People},
  author={Gurari, Danna and Li, Qing and Lin, Chi and Zhao, Yinan and Guo, Anhong and Stangl, Abigale and Bigham, Jeffrey P},
  journal={The IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)},
  year={2019}
}
Tell-and-Answer: Towards Explainable Visual Question Answering Using Attributes and Captions Oral
@article{li2018tell,
  title={Tell-and-Answer: Towards Explainable Visual Question Answering Using Attributes and Captions},
  author={Li, Qing and Fu, Jianlong and Yu, Dongfei and Mei, Tao and Luo, Jiebo},
  journal={Annual Conference on Empirical Methods in Natural Language Processing (EMNLP)},
  year={2018}
}
VQA-E: Explaining, Elaborating, and Enhancing Your Answers for Visual Questions
@article{li2018vqa,
  title={VQA-E: Explaining, Elaborating, and Enhancing Your Answers for Visual Questions},
  author={Li, Qing and Tao, Qingyi and Joty, Shafiq and Cai, Jianfei and Luo, Jiebo},
  journal={European Conference on Computer Vision (ECCV)},
  year={2018}
}
VizWiz Grand Challenge: Answering Visual Questions from Blind People Spotlight
Danna Gurari, Qing Li, Abigale J Stangl, Anhong Guo, Chi Lin, Kristen Grauman, Jiebo Luo, and Jeffrey P Bigham
The IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR), 2018
@article{gurari2018vizwiz,
  title={VizWiz Grand Challenge: Answering Visual Questions from Blind People},
  author={Gurari, Danna and Li, Qing and Stangl, Abigale J and Guo, Anhong and Lin, Chi and Grauman, Kristen and Luo, Jiebo and Bigham, Jeffrey P},
  journal={The IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)},
  year={2018}
}
Action Recognition by Learning Deep Multi-Granular Spatio-Temporal Video Representation Best Paper Finalist
@article{li2016action,
  title={Action Recognition by Learning Deep Multi-Granular Spatio-Temporal Video Representation},
  author={Li, Qing and Qiu, Zhaofan and Yao, Ting and Mei, Tao and Rui, Yong and Luo, Jiebo},
  journal={International Conference on Multimedia Retrieval (ICMR)},
  year={2016}
}