{"id":"https://openalex.org/W4405783312","doi":"https://doi.org/10.48550/arxiv.2412.17787","title":"Cross-Lingual Text-Rich Visual Comprehension: An Information Theory Perspective","display_name":"Cross-Lingual Text-Rich Visual Comprehension: An Information Theory Perspective","publication_year":2024,"publication_date":"2024-12-23","ids":{"openalex":"https://openalex.org/W4405783312","doi":"https://doi.org/10.48550/arxiv.2412.17787"},"language":"en","primary_location":{"id":"pmh:oai:arXiv.org:2412.17787","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2412.17787","pdf_url":"https://arxiv.org/pdf/2412.17787","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by-nc-sa","license_id":"https://openalex.org/licenses/cc-by-nc-sa","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":"text"},"type":"preprint","indexed_in":["arxiv","datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://arxiv.org/pdf/2412.17787","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5044347784","display_name":"Xinmiao Yu","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Yu, Xinmiao","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5073213620","display_name":"Xiaocheng Feng","orcid":"https://orcid.org/0000-0001-6011-0496"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Feng, Xiaocheng","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100712899","display_name":"Yun Li","orcid":"https://orcid.org/0000-0003-2927-2985"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Li, Yun","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5030174783","display_name":"Minghui Liao","orcid":"https://orcid.org/0000-0002-2583-4314"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Liao, Minghui","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100544606","display_name":"Yaqi Yu","orcid":"https://orcid.org/0009-0006-2700-8389"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yu, Ya-Qi","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5066977101","display_name":"Xiachong Feng","orcid":"https://orcid.org/0000-0002-4761-7484"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Feng, Xiachong","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5085410631","display_name":"Weihong Zhong","orcid":"https://orcid.org/0000-0002-5673-2222"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhong, Weihong","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5069465471","display_name":"Ruihan Chen","orcid":"https://orcid.org/0000-0003-4095-5533"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Chen, Ruihan","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5081386315","display_name":"Mengkang Hu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Hu, Mengkang","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5112882104","display_name":"Jihao Wu","orcid":"https://orcid.org/0000-0003-3235-0446"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wu, Jihao","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101645793","display_name":"Dandan Tu","orcid":"https://orcid.org/0000-0002-3560-124X"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Tu, Dandan","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5110628758","display_name":"Duyu Tang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Tang, Duyu","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5017671620","display_name":"Bing Qin","orcid":"https://orcid.org/0000-0002-2543-5604"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Qin, Bing","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":13,"corresponding_author_ids":["https://openalex.org/A5044347784"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11516","display_name":"Visual and Cognitive Learning Processes","score":0.8215000033378601,"subfield":{"id":"https://openalex.org/subfields/3205","display_name":"Experimental and Cognitive Psychology"},"field":{"id":"https://openalex.org/fields/32","display_name":"Psychology"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},"topics":[{"id":"https://openalex.org/T11516","display_name":"Visual and Cognitive Learning Processes","score":0.8215000033378601,"subfield":{"id":"https://openalex.org/subfields/3205","display_name":"Experimental and Cognitive Psychology"},"field":{"id":"https://openalex.org/fields/32","display_name":"Psychology"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},{"id":"https://openalex.org/T13083","display_name":"Advanced Text Analysis Techniques","score":0.7868000268936157,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/perspective","display_name":"Perspective (graphical)","score":0.8009055852890015},{"id":"https://openalex.org/keywords/comprehension","display_name":"Comprehension","score":0.6520057916641235},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.526344895362854},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.39935335516929626},{"id":"https://openalex.org/keywords/linguistics","display_name":"Linguistics","score":0.35760563611984253},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.2993931770324707},{"id":"https://openalex.org/keywords/philosophy","display_name":"Philosophy","score":0.08963045477867126},{"id":"https://openalex.org/keywords/programming-language","display_name":"Programming language","score":0.0794077217578888}],"concepts":[{"id":"https://openalex.org/C12713177","wikidata":"https://www.wikidata.org/wiki/Q1900281","display_name":"Perspective (graphical)","level":2,"score":0.8009055852890015},{"id":"https://openalex.org/C511192102","wikidata":"https://www.wikidata.org/wiki/Q5156948","display_name":"Comprehension","level":2,"score":0.6520057916641235},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.526344895362854},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.39935335516929626},{"id":"https://openalex.org/C41895202","wikidata":"https://www.wikidata.org/wiki/Q8162","display_name":"Linguistics","level":1,"score":0.35760563611984253},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.2993931770324707},{"id":"https://openalex.org/C138885662","wikidata":"https://www.wikidata.org/wiki/Q5891","display_name":"Philosophy","level":0,"score":0.08963045477867126},{"id":"https://openalex.org/C199360897","wikidata":"https://www.wikidata.org/wiki/Q9143","display_name":"Programming language","level":1,"score":0.0794077217578888}],"mesh":[],"locations_count":2,"locations":[{"id":"pmh:oai:arXiv.org:2412.17787","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2412.17787","pdf_url":"https://arxiv.org/pdf/2412.17787","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by-nc-sa","license_id":"https://openalex.org/licenses/cc-by-nc-sa","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":"text"},{"id":"doi:10.48550/arxiv.2412.17787","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2412.17787","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:oai:arXiv.org:2412.17787","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2412.17787","pdf_url":"https://arxiv.org/pdf/2412.17787","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by-nc-sa","license_id":"https://openalex.org/licenses/cc-by-nc-sa","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":"text"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":["https://openalex.org/W4391375266","https://openalex.org/W2899084033","https://openalex.org/W2748952813","https://openalex.org/W2390279801","https://openalex.org/W4391913857","https://openalex.org/W2358668433","https://openalex.org/W4396701345","https://openalex.org/W2376932109","https://openalex.org/W4382199040","https://openalex.org/W2075386565"],"abstract_inverted_index":{"Recent":[0],"Large":[1],"Vision-Language":[2,163],"Models":[3],"(LVLMs)":[4],"have":[5],"shown":[6],"promising":[7],"reasoning":[8],"capabilities":[9,227],"on":[10,40,117,210,233],"text-rich":[11,42,88],"images":[12,24],"from":[13,52,144,191],"charts,":[14],"tables,":[15],"and":[16,82,91,104,182],"documents.":[17],"However,":[18],"the":[19,27,34,46,49,53,56,179,211,218,225,234],"abundant":[20],"text":[21,81],"within":[22],"such":[23],"may":[25],"increase":[26],"model's":[28,180],"sensitivity":[29],"to":[30,36,72,148,193],"language.":[31],"This":[32,185],"raises":[33],"need":[35],"evaluate":[37],"LVLM":[38],"performance":[39,124,141,221],"cross-lingual":[41,126,145,170,194,220],"visual":[43,107,152,183],"inputs,":[44],"where":[45,167,200],"language":[47,54,77,110],"in":[48,123],"image":[50,80],"differs":[51],"of":[55,106,114,162,228],"instructions.":[57],"To":[58,154],"address":[59],"this,":[60],"we":[61,158],"introduce":[62],"XT-VQA":[63,84,118,212],"(Cross-Lingual":[64],"Text-Rich":[65],"Visual":[66],"Question":[67],"Answering),":[68],"a":[69,92,120,168,206],"benchmark":[70],"designed":[71],"assess":[73],"how":[74],"LVLMs":[75,116],"handle":[76],"inconsistency":[78],"between":[79,178],"questions.":[83],"integrates":[85],"five":[86],"existing":[87],"VQA":[89],"datasets":[90],"newly":[93],"collected":[94],"dataset,":[95],"XPaperQA,":[96],"covering":[97],"diverse":[98],"scenarios":[99],"that":[100,139,214],"require":[101],"faithful":[102],"recognition":[103],"comprehension":[105],"information":[108,136,177],"despite":[109],"inconsistency.":[111],"Our":[112],"evaluation":[113],"prominent":[115],"reveals":[119],"significant":[121],"drop":[122],"for":[125,129,237],"scenarios,":[127],"even":[128],"models":[130],"with":[131],"multilingual":[132],"capabilities.":[133],"A":[134],"mutual":[135,176],"analysis":[137],"suggests":[138],"this":[140,156],"gap":[142],"stems":[143],"questions":[146],"failing":[147],"adequately":[149],"activate":[150],"relevant":[151],"information.":[153,184],"mitigate":[155],"issue,":[157],"propose":[159],"MVCL-MI":[160,215],"(Maximization":[161],"Cross-Lingual":[164],"Mutual":[165],"Information),":[166],"visual-text":[169,219],"alignment":[171],"is":[172,186],"built":[173],"by":[174,188],"maximizing":[175],"outputs":[181],"achieved":[187],"distilling":[189],"knowledge":[190],"monolingual":[192,201],"settings":[195],"through":[196],"KL":[197],"divergence":[198],"minimization,":[199],"output":[202],"logits":[203],"serve":[204],"as":[205],"teacher.":[207],"Experimental":[208],"results":[209],"demonstrate":[213],"effectively":[216],"reduces":[217],"disparity":[222],"while":[223],"preserving":[224],"inherent":[226],"LVLMs,":[229],"shedding":[230],"new":[231],"light":[232],"potential":[235],"practice":[236],"improving":[238],"LVLMs.":[239],"Codes":[240],"are":[241],"available":[242],"at:":[243],"https://github.com/Stardust-y/XTVQA.git":[244]},"counts_by_year":[],"updated_date":"2026-02-09T09:26:11.010843","created_date":"2024-12-26T00:00:00"}
