{"id":"https://openalex.org/W2751353152","doi":"https://doi.org/10.1109/icpp.2017.52","title":"Optimizations of Two Compute-Bound Scientific Kernels on the SW26010 Many-Core Processor","display_name":"Optimizations of Two Compute-Bound Scientific Kernels on the SW26010 Many-Core Processor","publication_year":2017,"publication_date":"2017-08-01","ids":{"openalex":"https://openalex.org/W2751353152","doi":"https://doi.org/10.1109/icpp.2017.52","mag":"2751353152"},"language":"en","primary_location":{"id":"doi:10.1109/icpp.2017.52","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icpp.2017.52","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2017 46th International Conference on Parallel Processing (ICPP)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5062471441","display_name":"James Lin","orcid":"https://orcid.org/0000-0003-4404-5027"},"institutions":[{"id":"https://openalex.org/I183067930","display_name":"Shanghai Jiao Tong University","ror":"https://ror.org/0220qvk04","country_code":"CN","type":"education","lineage":["https://openalex.org/I183067930"]},{"id":"https://openalex.org/I114531698","display_name":"Tokyo Institute of Technology","ror":"https://ror.org/0112mx960","country_code":"JP","type":"education","lineage":["https://openalex.org/I114531698"]}],"countries":["CN","JP"],"is_corresponding":true,"raw_author_name":"James Lin","raw_affiliation_strings":["Shanghai Jiao Tong University, China","Tokyo Institute of Technology, Japan"],"affiliations":[{"raw_affiliation_string":"Shanghai Jiao Tong University, China","institution_ids":["https://openalex.org/I183067930"]},{"raw_affiliation_string":"Tokyo Institute of Technology, Japan","institution_ids":["https://openalex.org/I114531698"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5045564208","display_name":"Zhigeng Xu","orcid":null},"institutions":[{"id":"https://openalex.org/I183067930","display_name":"Shanghai Jiao Tong University","ror":"https://ror.org/0220qvk04","country_code":"CN","type":"education","lineage":["https://openalex.org/I183067930"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Zhigeng Xu","raw_affiliation_strings":["Shanghai Jiao Tong University, China"],"affiliations":[{"raw_affiliation_string":"Shanghai Jiao Tong University, China","institution_ids":["https://openalex.org/I183067930"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5006972521","display_name":"Akira Nukada","orcid":"https://orcid.org/0000-0001-7959-6975"},"institutions":[{"id":"https://openalex.org/I114531698","display_name":"Tokyo Institute of Technology","ror":"https://ror.org/0112mx960","country_code":"JP","type":"education","lineage":["https://openalex.org/I114531698"]}],"countries":["JP"],"is_corresponding":false,"raw_author_name":"Akira Nukada","raw_affiliation_strings":["Tokyo Institute of Technology, Japan"],"affiliations":[{"raw_affiliation_string":"Tokyo Institute of Technology, Japan","institution_ids":["https://openalex.org/I114531698"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5035025604","display_name":"Naoya Maruyama","orcid":null},"institutions":[{"id":"https://openalex.org/I4210129730","display_name":"RIKEN Center for Computational Science","ror":"https://ror.org/03r519674","country_code":"JP","type":"facility","lineage":["https://openalex.org/I4210110652","https://openalex.org/I4210129730"]}],"countries":["JP"],"is_corresponding":false,"raw_author_name":"Naoya Maruyama","raw_affiliation_strings":["RIKEN Advanced Institute for Computational Science"],"affiliations":[{"raw_affiliation_string":"RIKEN Advanced Institute for Computational Science","institution_ids":["https://openalex.org/I4210129730"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5100634486","display_name":"Satoshi Matsuoka","orcid":"https://orcid.org/0000-0003-1910-8532"},"institutions":[{"id":"https://openalex.org/I114531698","display_name":"Tokyo Institute of Technology","ror":"https://ror.org/0112mx960","country_code":"JP","type":"education","lineage":["https://openalex.org/I114531698"]}],"countries":["JP"],"is_corresponding":false,"raw_author_name":"Satoshi Matsuoka","raw_affiliation_strings":["Tokyo Institute of Technology, Japan"],"affiliations":[{"raw_affiliation_string":"Tokyo Institute of Technology, Japan","institution_ids":["https://openalex.org/I114531698"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":5,"corresponding_author_ids":["https://openalex.org/A5062471441"],"corresponding_institution_ids":["https://openalex.org/I114531698","https://openalex.org/I183067930"],"apc_list":null,"apc_paid":null,"fwci":2.7792,"has_fulltext":false,"cited_by_count":20,"citation_normalized_percentile":{"value":0.92118908,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":94,"max":98},"biblio":{"volume":null,"issue":null,"first_page":"432","last_page":"441"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10829","display_name":"Interconnection Networks and Systems","score":0.9990000128746033,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10715","display_name":"Distributed and Parallel Computing Systems","score":0.9977999925613403,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8175668120384216},{"id":"https://openalex.org/keywords/parallel-computing","display_name":"Parallel computing","score":0.6402444839477539},{"id":"https://openalex.org/keywords/kernel","display_name":"Kernel (algebra)","score":0.5278239846229553},{"id":"https://openalex.org/keywords/programming-paradigm","display_name":"Programming paradigm","score":0.5132403373718262},{"id":"https://openalex.org/keywords/supercomputer","display_name":"Supercomputer","score":0.49848008155822754},{"id":"https://openalex.org/keywords/memory-bandwidth","display_name":"Memory bandwidth","score":0.44451671838760376},{"id":"https://openalex.org/keywords/computer-architecture","display_name":"Computer architecture","score":0.3236572742462158},{"id":"https://openalex.org/keywords/programming-language","display_name":"Programming language","score":0.12140977382659912}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8175668120384216},{"id":"https://openalex.org/C173608175","wikidata":"https://www.wikidata.org/wiki/Q232661","display_name":"Parallel computing","level":1,"score":0.6402444839477539},{"id":"https://openalex.org/C74193536","wikidata":"https://www.wikidata.org/wiki/Q574844","display_name":"Kernel (algebra)","level":2,"score":0.5278239846229553},{"id":"https://openalex.org/C34165917","wikidata":"https://www.wikidata.org/wiki/Q188267","display_name":"Programming paradigm","level":2,"score":0.5132403373718262},{"id":"https://openalex.org/C83283714","wikidata":"https://www.wikidata.org/wiki/Q121117","display_name":"Supercomputer","level":2,"score":0.49848008155822754},{"id":"https://openalex.org/C188045654","wikidata":"https://www.wikidata.org/wiki/Q17148339","display_name":"Memory bandwidth","level":2,"score":0.44451671838760376},{"id":"https://openalex.org/C118524514","wikidata":"https://www.wikidata.org/wiki/Q173212","display_name":"Computer architecture","level":1,"score":0.3236572742462158},{"id":"https://openalex.org/C199360897","wikidata":"https://www.wikidata.org/wiki/Q9143","display_name":"Programming language","level":1,"score":0.12140977382659912},{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.0},{"id":"https://openalex.org/C114614502","wikidata":"https://www.wikidata.org/wiki/Q76592","display_name":"Combinatorics","level":1,"score":0.0}],"mesh":[],"locations_count":2,"locations":[{"id":"doi:10.1109/icpp.2017.52","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icpp.2017.52","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2017 46th International Conference on Parallel Processing (ICPP)","raw_type":"proceedings-article"},{"id":"pmh:oai:t2r2.star.titech.ac.jp:50466522","is_oa":false,"landing_page_url":"http://t2r2.star.titech.ac.jp/cgi-bin/publicationinfo.cgi?q_publication_content_number=CTT100791196","pdf_url":null,"source":{"id":"https://openalex.org/S4377196385","display_name":"Tokyo Tech Research Repository (Tokyo Institute of Technology)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I114531698","host_organization_name":"Tokyo Institute of Technology","host_organization_lineage":["https://openalex.org/I114531698"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":"Conference Paper"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[{"id":"https://openalex.org/F4320334764","display_name":"Japan Society for the Promotion of Science","ror":"https://ror.org/00hhkn466"},{"id":"https://openalex.org/F4320335777","display_name":"National Key Research and Development Program of China","ror":null}],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":26,"referenced_works":["https://openalex.org/W610778723","https://openalex.org/W996105196","https://openalex.org/W1977230955","https://openalex.org/W2002257715","https://openalex.org/W2002555321","https://openalex.org/W2006738855","https://openalex.org/W2073061372","https://openalex.org/W2074833026","https://openalex.org/W2090593986","https://openalex.org/W2097572763","https://openalex.org/W2103817093","https://openalex.org/W2170996201","https://openalex.org/W2171296521","https://openalex.org/W2475126267","https://openalex.org/W2561247022","https://openalex.org/W2561515725","https://openalex.org/W2562913534","https://openalex.org/W2565616222","https://openalex.org/W2726914369","https://openalex.org/W2743163708","https://openalex.org/W2747036920","https://openalex.org/W2749559108","https://openalex.org/W2751354592","https://openalex.org/W2893613126","https://openalex.org/W6740435300","https://openalex.org/W6755065348"],"related_works":["https://openalex.org/W1997955449","https://openalex.org/W3089579782","https://openalex.org/W2111209735","https://openalex.org/W2117014006","https://openalex.org/W1500804266","https://openalex.org/W2370911386","https://openalex.org/W2358725432","https://openalex.org/W1582519588","https://openalex.org/W2354106728","https://openalex.org/W2096765878"],"abstract_inverted_index":{"The":[0,20,106,175,233],"home-grown":[1,173],"SW26010":[2,31,252],"many-core":[3],"processor":[4,42],"enabled":[5],"the":[6,17,23,30,41,82,102,124,130,139,152,158,161,165,172,185,193,197,211,215,231,241,246,251],"production":[7],"of":[8,22,132,151,163,203],"China's":[9],"first":[10,107,159],"independently":[11],"developed":[12],"number-one":[13],"ranked":[14],"supercomputer":[15],"-":[16],"Sunway":[18],"TaihuLight.":[19],"design":[21,255],"limited":[24],"off-chip":[25],"memory":[26],"bandwidth,":[27],"however,":[28],"renders":[29],"a":[32,46,68,73,206,226],"highly":[33,166],"memory-bound":[34],"processor.":[35,174],"To":[36,88],"compensate":[37],"for":[38,119,157,189,230],"this":[39,85,120],"limitation,":[40],"was":[43,117],"designed":[44,118],"with":[45,205,236],"unique":[47],"hardware":[48,187],"feature,":[49],"\"Register":[50],"Level":[51],"Communication\"":[52],"(RLC),":[53],"to":[54,100,122,141,144,184,209,254,259],"share":[55],"register":[56],"data":[57,125],"among":[58],"its":[59],"8":[60,62],"\u00d7":[61],"computing":[63],"processing":[64],"elements":[65],"(CPEs)":[66],"via":[67],"2D":[69],"onchip":[70],"network.":[71],"Such":[72],"radical":[74],"architecture":[75],"has":[76],"sparked":[77],"global":[78],"researchers'":[79],"concerns":[80],"regarding":[81],"programming":[83,104,216,228],"challenges":[84,217],"may":[86],"cause.":[87],"address":[89],"these":[90,220],"concerns,":[91],"we":[92,179,195,224],"adopted":[93],"two":[94,221],"compute-bound":[95],"scientific":[96],"kernels":[97],"as":[98],"benchmarks":[99],"identify":[101],"potential":[103],"challenges.":[105],"kernel":[108,121,140,177],"is":[109,169,181,253],"doubleprecision":[110],"general":[111],"matrix-multiplication":[112],"(DGEMM).":[113],"An":[114],"RLCfriendly":[115],"algorithm":[116,258],"reuse":[123],"that":[126,178,240],"already":[127],"reside":[128],"in":[129,147],"registers":[131],"64":[133],"CPEs.":[134],"This":[135,154],"novel":[136],"optimization":[137],"enables":[138],"achieve":[142],"up":[143],"88.7%":[145],"efficiency":[146],"one":[148],"core":[149],"group":[150],"SW26010.":[153,232],"paper":[155,234],"reveals,":[156],"time,":[160],"details":[162],"how":[164],"efficient":[167],"DGEMM":[168],"implemented":[170],"on":[171,192,214,250],"second":[176],"used":[180],"N-body.":[182],"Due":[183],"inefficient":[186],"support":[188],"transcendental":[190],"operations":[191],"SW26010,":[194],"replaced":[196],"reciprocal":[198],"square":[199],"root":[200],"(rsqrt)":[201],"instruction":[202],"N-body":[204],"software":[207],"routine":[208],"tackle":[210],"problem.":[212],"Based":[213],"identified":[218],"through":[219],"optimized":[222],"kernels,":[223],"proposed":[225],"three-level":[227],"guideline":[229],"concludes":[235],"our":[237],"crucial":[238],"finding":[239],"critical":[242],"step":[243],"towards":[244],"bridging":[245],"ninja":[247],"performance":[248],"gap":[249],"an":[256],"RLC-friendly":[257],"increase":[260],"arithmetic":[261],"intensity.":[262]},"counts_by_year":[{"year":2024,"cited_by_count":2},{"year":2022,"cited_by_count":2},{"year":2021,"cited_by_count":4},{"year":2020,"cited_by_count":4},{"year":2019,"cited_by_count":4},{"year":2018,"cited_by_count":4}],"updated_date":"2026-04-04T16:13:02.066488","created_date":"2025-10-10T00:00:00"}
