I am Songyang Zhang (张宋扬 - in Chinese, Pronounce). I am an Applied Scientist at Amazon AGI, building video generative models. I received my Ph.D. from the University of Rochester, advised by Prof. Jiebo Luo. Before that, I received my master’s degree from Zhejiang University, advised by Prof. Jun Xiao, and my bachelor’s degree from Southeast University. My research focuses on computer vision and natural language processing, especially the intersection of video and language.
% arXiv preprint. The arXiv identifier is also given in eprint/archiveprefix so
% tools can resolve it; journal is kept unchanged for existing renderers.
% {Latent-Shift} is braced so sentence-casing styles keep the method name intact.
@article{an2023latent,
  author        = {An, Jie and Zhang, Songyang and Yang, Harry and Gupta, Sonal and Huang, Jia-Bin and Luo, Jiebo and Yin, Xi},
  title         = {{Latent-Shift}: Latent Diffusion with Temporal Shift for Efficient Text-to-Video Generation},
  journal       = {arXiv preprint arXiv:2304.08477},
  year          = {2023},
  eprint        = {2304.08477},
  archiveprefix = {arXiv}
}
EMNLP
Learning a Grammar Inducer by Watching Millions of Instructional YouTube Videos
Songyang Zhang, Linfeng Song, Lifeng Jin, Haitao Mi, Kun Xu, Dong Yu, and Jiebo Luo
In Conference on Empirical Methods in Natural Language Processing, 2022
% Conference paper. oral and talk are non-standard fields that standard BibTeX
% styles ignore; presumably they are read by the site's bibliography template.
% oral was misspelled as "ture"; talk now carries an explicit https:// scheme
% like the talk links in the other entries.
@inproceedings{zhang2022training,
  author    = {Zhang, Songyang and Song, Linfeng and Jin, Lifeng and Mi, Haitao and Xu, Kun and Yu, Dong and Luo, Jiebo},
  title     = {Learning a Grammar Inducer by Watching Millions of Instructional {YouTube} Videos},
  booktitle = {Conference on Empirical Methods in Natural Language Processing},
  year      = {2022},
  oral      = {true},
  talk      = {https://www.youtube.com/watch?v=7caDMC24oro}
}
ICLR
Make-A-Video: Text-to-video Generation without Text-Video Data
Uriel Singer, Adam Polyak, Thomas Hayes, Xi Yin, Jie An, Songyang Zhang, Qiyuan Hu, Harry Yang, Oron Ashual, Oran Gafni, Devi Parikh, Sonal Gupta, and Yaniv Taigman
In International Conference on Learning Representations, 2022
% Conference paper. news is a non-standard field that standard BibTeX styles
% ignore; presumably it is read by the site's bibliography template.
% {Make-A-Video} is braced so sentence-casing styles keep the system name intact.
@inproceedings{singer2022make,
  author    = {Singer, Uriel and Polyak, Adam and Hayes, Thomas and Yin, Xi and An, Jie and Zhang, Songyang and Hu, Qiyuan and Yang, Harry and Ashual, Oron and Gafni, Oran and Parikh, Devi and Gupta, Sonal and Taigman, Yaniv},
  title     = {{Make-A-Video}: Text-to-video Generation without Text-Video Data},
  booktitle = {International Conference on Learning Representations},
  year      = {2022},
  news      = {https://ai.facebook.com/blog/generative-ai-text-to-video/}
}
ECCV
MUGEN: A Playground for Video-Audio-Text Multimodal Understanding and GENeration
Thomas Hayes*, Songyang Zhang*, Xi Yin, Guan Pang, Sasha Sheng, Harry Yang, Songwei Ge, Qiyuan Hu, and Devi Parikh
% Conference paper. news and talk are non-standard fields that standard BibTeX
% styles ignore; presumably they are read by the site's bibliography template.
% {MUGEN} and {GENeration} are braced so sentence-casing styles keep the
% acronym and its deliberate mixed-case expansion.
@inproceedings{hayes2022mugen,
  author    = {Hayes, Thomas and Zhang, Songyang and Yin, Xi and Pang, Guan and Sheng, Sasha and Yang, Harry and Ge, Songwei and Hu, Qiyuan and Parikh, Devi},
  title     = {{MUGEN}: A Playground for Video-Audio-Text Multimodal Understanding and {GENeration}},
  booktitle = {European Conference on Computer Vision},
  year      = {2022},
  news      = {https://ai.facebook.com/blog/introducing-mugen-a-new-dataset-for-multimodal-research/},
  talk      = {https://youtu.be/it0r6Q9a1jY}
}
ECCV
Expanding Language-Image Pretrained Models for General Video Recognition
% Conference paper. oral is a non-standard field that standard BibTeX styles
% ignore; presumably it is read by the site's bibliography template.
@inproceedings{ni2022expanding,
  author    = {Ni, Bolin and Peng, Houwen and Chen, Minghao and Zhang, Songyang and Meng, Gaofeng and Fu, Jianlong and Xiang, Shiming and Ling, Haibin},
  title     = {Expanding Language-Image Pretrained Models for General Video Recognition},
  booktitle = {European Conference on Computer Vision},
  year      = {2022},
  oral      = {true}
}
ICCV
SAT: 2D Semantics Assisted Training for 3D Visual Grounding
Zhengyuan Yang, Songyang Zhang, Liwei Wang, and Jiebo Luo
In IEEE International Conference on Computer Vision, 2021
% Conference paper. oral is a non-standard field that standard BibTeX styles
% ignore; presumably it is read by the site's bibliography template.
% {SAT}, {2D} and {3D} are braced so sentence-casing styles do not lowercase them.
% NOTE(review): the key ends in "sau" although the method is SAT; the key is
% left untouched because existing citations may depend on it.
@inproceedings{yang2021sau,
  author    = {Yang, Zhengyuan and Zhang, Songyang and Wang, Liwei and Luo, Jiebo},
  title     = {{SAT}: {2D} Semantics Assisted Training for {3D} Visual Grounding},
  booktitle = {IEEE International Conference on Computer Vision},
  year      = {2021},
  oral      = {true}
}
NAACL
Video-aided Unsupervised Grammar Induction
Songyang Zhang, Linfeng Song, Lifeng Jin, Kun Xu, Dong Yu, and Jiebo Luo
In Conference of the North American Chapter of the Association for Computational Linguistics, 2021
% Conference paper. talk and news are non-standard fields that standard BibTeX
% styles ignore; presumably they are read by the site's bibliography template.
@inproceedings{zhang2021video,
  author    = {Zhang, Songyang and Song, Linfeng and Jin, Lifeng and Xu, Kun and Yu, Dong and Luo, Jiebo},
  title     = {Video-aided Unsupervised Grammar Induction},
  booktitle = {Conference of the North American Chapter of the Association for Computational Linguistics},
  year      = {2021},
  talk      = {https://underline.io/lecture/19921-video-aided-unsupervised-grammar-induction},
  news      = {https://mp.weixin.qq.com/s/bzh7lbcEzfwzRsDmOA1GsQ}
}
TPAMI
Multi-Scale 2D Temporal Adjacency Networks for Moment Localization with Natural Language
Songyang Zhang, Houwen Peng, Jianlong Fu, Yijuan Lu, and Jiebo Luo
IEEE Transactions on Pattern Analysis and Machine Intelligence, 2021
% Journal article. {2D} is braced so sentence-casing styles do not render it as "2d".
@article{zhang2020multi,
  author  = {Zhang, Songyang and Peng, Houwen and Fu, Jianlong and Lu, Yijuan and Luo, Jiebo},
  title   = {Multi-Scale {2D} Temporal Adjacency Networks for Moment Localization with Natural Language},
  journal = {IEEE Transactions on Pattern Analysis and Machine Intelligence},
  year    = {2021}
}
arXiv
Learning Sparse 2D Temporal Adjacent Networks for Temporal Action Localization
Songyang Zhang, Houwen Peng, Le Yang, Jianlong Fu, and Jiebo Luo
Winner of the HACS Temporal Action Localization Challenge at ICCV 2019
% arXiv preprint. The arXiv identifier is also given in eprint/archiveprefix so
% tools can resolve it; journal is kept unchanged for existing renderers.
% {2D} is braced so sentence-casing styles do not render it as "2d".
@article{zhang2019learning,
  author        = {Zhang, Songyang and Peng, Houwen and Yang, Le and Fu, Jianlong and Luo, Jiebo},
  title         = {Learning Sparse {2D} Temporal Adjacent Networks for Temporal Action Localization},
  journal       = {arXiv preprint arXiv:1912.03612},
  year          = {2019},
  eprint        = {1912.03612},
  archiveprefix = {arXiv}
}
AAAI
Learning 2D Temporal Adjacent Networks for Moment Localization with Natural Language
Songyang Zhang, Houwen Peng, Jianlong Fu, and Jiebo Luo
In the AAAI Conference on Artificial Intelligence, 2020
% Conference paper. news is a non-standard field that standard BibTeX styles
% ignore; presumably it is read by the site's bibliography template.
% {2D} is braced so sentence-casing styles do not render it as "2d".
% booktitle intentionally keeps its leading lowercase "the" unchanged.
@inproceedings{zhang2020learning,
  author    = {Zhang, Songyang and Peng, Houwen and Fu, Jianlong and Luo, Jiebo},
  title     = {Learning {2D} Temporal Adjacent Networks for Moment Localization with Natural Language},
  booktitle = {the AAAI Conference on Artificial Intelligence},
  year      = {2020},
  news      = {https://zhuanlan.zhihu.com/p/269968876}
}
ACMMM
Exploiting Temporal Relationships in Video Moment Localization with Natural Language
Songyang Zhang, Jinsong Su, and Jiebo Luo
In ACM International Conference on Multimedia, 2019
% Conference paper; only standard BibTeX fields are used.
@inproceedings{zhang2019exploiting,
  author    = {Zhang, Songyang and Su, Jinsong and Luo, Jiebo},
  title     = {Exploiting Temporal Relationships in Video Moment Localization with Natural Language},
  booktitle = {ACM International Conference on Multimedia},
  year      = {2019}
}
TMM
Fusing Geometric Features for Skeleton-Based Action Recognition Using Multilayer LSTM Networks
Songyang Zhang, Yang Yang, Jun Xiao, Xiaoming Liu, Yi Yang, Di Xie, and Yueting Zhuang
% Journal article. {LSTM} is braced so sentence-casing styles do not render it as "lstm".
@article{zhang2018fusing,
  author  = {Zhang, Songyang and Yang, Yang and Xiao, Jun and Liu, Xiaoming and Yang, Yi and Xie, Di and Zhuang, Yueting},
  title   = {Fusing Geometric Features for Skeleton-Based Action Recognition Using Multilayer {LSTM} Networks},
  journal = {IEEE Transactions on Multimedia},
  year    = {2018}
}
WACV
On Geometric Features for Skeleton-Based Action Recognition Using Multilayer LSTM Networks
Songyang Zhang, Xiaoming Liu, and Jun Xiao
In IEEE Winter Conference on Applications of Computer Vision, 2017
% Conference paper. {LSTM} is braced so sentence-casing styles do not render it as "lstm".
@inproceedings{zhang2017geometric,
  author    = {Zhang, Songyang and Liu, Xiaoming and Xiao, Jun},
  title     = {On Geometric Features for Skeleton-Based Action Recognition Using Multilayer {LSTM} Networks},
  booktitle = {IEEE Winter Conference on Applications of Computer Vision},
  year      = {2017}
}