{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,15]],"date-time":"2026-04-15T17:53:39Z","timestamp":1776275619729,"version":"3.50.1"},"reference-count":74,"publisher":"IEEE","license":[{"start":{"date-parts":[[2023,6,1]],"date-time":"2023-06-01T00:00:00Z","timestamp":1685577600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2023,6,1]],"date-time":"2023-06-01T00:00:00Z","timestamp":1685577600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"DOI":"10.13039\/501100001809","name":"NSF of China","doi-asserted-by":"publisher","award":["62176155"],"award-info":[{"award-number":["62176155"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/100007847","name":"Jilin Province","doi-asserted-by":"publisher","award":["20200201037JC"],"award-info":[{"award-number":["20200201037JC"]}],"id":[{"id":"10.13039\/100007847","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2023,6]]},"DOI":"10.1109\/cvpr52729.2023.01415","type":"proceedings-article","created":{"date-parts":[[2023,8,22]],"date-time":"2023-08-22T17:30:52Z","timestamp":1692725452000},"page":"14730-14740","source":"Crossref","is-referenced-by-count":245,"title":["Generating Human Motion from Textual Descriptions with Discrete Representations"],"prefix":"10.1109","author":[{"given":"Jianrong","family":"Zhang","sequence":"first","affiliation":[{"name":"Jilin University"}]},{"given":"Yangsong","family":"Zhang","sequence":"additional","affiliation":[{"name":"Shanghai Jiao Tong University"}]},{"given":"Xiaodong","family":"Cun","sequence":"additional","affiliation":[{"name":"Tencent AI Lab"}]},{"given":"Yong","family":"Zhang","sequence":"additional","affiliation":[{"name":"Tencent AI Lab"}]},{"given":"Hongwei","family":"Zhao","sequence":"additional","affiliation":[{"name":"Jilin University"}]},{"given":"Hongtao","family":"Lu","sequence":"additional","affiliation":[{"name":"Shanghai Jiao Tong University"}]},{"given":"Xi","family":"Shen","sequence":"additional","affiliation":[{"name":"Tencent AI Lab"}]},{"given":"Shan","family":"Ying","sequence":"additional","affiliation":[{"name":"Tencent AI Lab"}]}],"member":"263","reference":[{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1145\/3450626.3459932"},{"key":"ref57","author":"ramesh","year":"2022","journal-title":"Hierarchical text-conditional image generation with CLIP latents"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.173"},{"key":"ref56","article-title":"Improving language understanding by generative pre-training","author":"radford","year":"0","journal-title":"Advances in Neural IInformation Processing Systems"},{"key":"ref15","author":"dhariwal","year":"2020","journal-title":"Jukebox A generative model for music"},{"key":"ref59","article-title":"Generating diverse high-fidelity images with vq-vae-2","author":"razavi","year":"0","journal-title":"Advances in Neural IInformation Processing Systems"},{"key":"ref14","article-title":"Vqgan-clip: Open domain image generation and editing with natural language guidance","author":"crowson","year":"0","journal-title":"Proceedings of the European Conference on Computer Vision (ECCV)"},{"key":"ref58","article-title":"Zero-shot text-to-image 
generation","author":"ramesh","year":"0","journal-title":"International Conference on Machine Learning (ICML)"},{"key":"ref53","article-title":"TEMOS: Generating diverse human motions from textual descriptions","author":"petrovich","year":"0","journal-title":"Proceedings of the European Conference on Computer Vision (ECCV)"},{"key":"ref52","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01080"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.24963\/ijcai.2022\/111"},{"key":"ref55","article-title":"Learning transferable visual models from natural language supervision","author":"radford","year":"0","journal-title":"International Conference on Machine Learning (ICML)"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1109\/VR50410.2021.00037"},{"key":"ref54","doi-asserted-by":"publisher","DOI":"10.1089\/big.2016.0028"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00054"},{"key":"ref16","article-title":"The challenge of realistic music generation: modelling raw audio at scale","author":"dieleman","year":"0","journal-title":"Advances in Neural IInformation Processing Systems"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01268"},{"key":"ref18","author":"duan","year":"2021","journal-title":"Single-shot motion completion with transformer"},{"key":"ref51","article-title":"Quaternet: A quaternion-based recurrent model for human motion","author":"pavllo","year":"0","journal-title":"Proceedings of the British Machine Vision Conference (BMVC)"},{"key":"ref50","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.497"},{"key":"ref46","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00554"},{"key":"ref45","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-20068-7_24"},{"key":"ref48","article-title":"History repeats itself: Human motion prediction via motion attention","author":"mao","year":"0","journal-title":"Proceedings of the European Conference on Computer Vision (ECCV)"},{"key":"ref47","doi-asserted-by":"publisher","DOI":"10.1109\/ICAR.2015.7251476"},{"key":"ref42","article-title":"Generating animated videos of human activities from natural language descriptions","author":"lin","year":"0","journal-title":"Advances in Neural IInformation Processing Systems"},{"key":"ref41","article-title":"Bailando: 3d dance generation by actor-critic gpt with choreographic memory","author":"li","year":"0","journal-title":"Proceedings of the Conference on Computer Vision and Pattern Recognition (CVPR)"},{"key":"ref44","article-title":"Decoupled weight decay regularization","author":"loshchilov","year":"0","journal-title":"International Conference on Learning Representations (ICLR)"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.1145\/2816795.2818013"},{"key":"ref49","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00958"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1093\/oso\/9780195073591.001.0001"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1109\/3DV57658.2022.00053"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW.2018.00191"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00527"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1109\/3DV.2019.00084"},{"key":"ref6","author":"aristidou","year":"2021","journal-title":"Rhythm is a dancer Music-driven motion synthesis with global 
structure"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1145\/3550454.3555435"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01315"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.1109\/3DV50981.2020.00102"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1145\/2897824.2925975"},{"key":"ref37","article-title":"Dancing to music","author":"lee","year":"0","journal-title":"Advances in Neural IInformation Processing Systems"},{"key":"ref36","article-title":"Auto-encoding variational bayes","author":"kingma","year":"0","journal-title":"International Conference on Learning Representations (ICLR)"},{"key":"ref31","article-title":"Gans trained by a two time-scale update rule converge to a local nash equilibrium","author":"heusel","year":"0","journal-title":"Advances in Neural IInformation Processing Systems"},{"key":"ref30","article-title":"Human motion prediction via spatio-temporal in-painting","author":"hernandez","year":"0","journal-title":"Proceedings of the Conference on Computer Vision and Pattern Recognition (CVPR)"},{"key":"ref74","author":"zhang","year":"2022","journal-title":"Motiondiffuse Text-driven human motion generation with diffusion model"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.1145\/3072959.3073663"},{"key":"ref32","article-title":"Denoising diffusion probabilistic models","author":"ho","year":"0","journal-title":"Advances in Neural IInformation Processing Systems"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA.2018.8460608"},{"key":"ref1","year":"0","journal-title":"CMU Graphics Lab Motion Capture Database"},{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.1145\/3528223.3530157"},{"key":"ref38","author":"li","year":"2020","journal-title":"Learning to generate diverse dance motions with transformer"},{"key":"ref71","author":"xin","year":"2022","journal-title":"Executing your commands via motion diffusion in latent space"},{"key":"ref70","article-title":"Hierarchical quantized autoencoders","author":"williams","year":"0","journal-title":"Advances in Neural IInformation Processing Systems"},{"key":"ref73","article-title":"Mt-vae: Learning motion transformations to generate multimodal human dynamics","author":"yan","year":"0","journal-title":"Proceedings of the European Conference on Computer Vision (ECCV)"},{"key":"ref72","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00449"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1145\/3394171.3413635"},{"key":"ref68","article-title":"Neural discrete representation learning","author":"van den oord","year":"0","journal-title":"Advances in Neural IInformation Processing Systems"},{"key":"ref23","article-title":"Tm2t: Stochastic and tokenized modeling for the reciprocal generation of 3d human motions and texts","author":"guo","year":"0","journal-title":"Proceedings of the European Conference on Computer Vision (ECCV)"},{"key":"ref67","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00165"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.5244\/C.31.119"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1109\/WACV56688.2023.00479"},{"key":"ref69","article-title":"Attention is all you need","author":"vaswani","year":"0","journal-title":"Advances in Neural IInformation Processing 
Systems"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.494"},{"key":"ref64","doi-asserted-by":"publisher","DOI":"10.1145\/3528223.3530090"},{"key":"ref63","doi-asserted-by":"publisher","DOI":"10.1145\/3355089.3356505"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00509"},{"key":"ref66","author":"tevet","year":"2022","journal-title":"Human motion diffusion model"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00143"},{"key":"ref65","article-title":"Motionclip: Exposing human motion generation to clip space","author":"tevet","year":"0","journal-title":"Proceedings of the European Conference on Computer Vision (ECCV)"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1145\/3386569.3392480"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1145\/3283254.3283277"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"ref60","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"ref62","doi-asserted-by":"publisher","DOI":"10.1145\/3528223.3530178"},{"key":"ref61","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00467"}],"event":{"name":"2023 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR)","location":"Vancouver, BC, Canada","start":{"date-parts":[[2023,6,17]]},"end":{"date-parts":[[2023,6,24]]}},"container-title":["2023 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/10203037\/10203050\/10203705.pdf?arnumber=10203705","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2023,9,11]],"date-time":"2023-09-11T17:52:22Z","timestamp":1694454742000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10203705\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,6]]},"references-count":74,"URL":"https:\/\/doi.org\/10.1109\/cvpr52729.2023.01415","relation":{},"subject":[],"published":{"date-parts":[[2023,6]]}}}