[{"id":21216,"title":"Distributed Load Balancing for Large-Scale AI Systems","permalink":"https:\/\/bschool.nus.edu.sg\/biz-events\/event\/distributed-load-balancing-for-large-scale-ai-systems\/","category":"Seminars and talks","event_dept":{"value":"analytics-operations","label":"Analytics & Operations"},"event_sec_dept":false,"event_details":{"event_start_date":"2 December 2025","event_end_date":"2 December 2025","event_start_time":"10:00 am","event_end_time":"11:45 am","event_dress_code":"NA"},"event_loc":{"eve_address_selection":"1","eve_location_1":{"eve_org":"NUS Business School","eve_build":"Mochtar Riady Building","eve_room":"BIZ1 0302","eve_add":"15 Kent Ridge Drive","eve_count":"Singapore","eve_copos":"119245","eve_map_url":"https:\/\/goo.gl\/maps\/Q1kyjwxHNE22"},"eve_location_2":{"eve_org":"Shaw Foundation Alumni House","eve_build":"","eve_room":"Clove and Lemongrass Room Level 2","eve_add":"11 Kent Ridge Drive","eve_count":"Singapore","eve_copos":"119244","eve_map_url":"https:\/\/goo.gl\/maps\/docgThkDWFxKdb9c7"},"eve_location_3":{"eve_org":"Hon Sui Sen Memorial Library Auditorium","eve_build":"","eve_room":"","eve_add":"1 Hon Sui Sen Drive","eve_count":"Singapore","eve_copos":"117588","eve_map_url":"https:\/\/goo.gl\/maps\/NJjWK4RMpC92"},"eve_location_4":{"eve_org":"NUSS Kent Ridge Guild House","eve_build":"","eve_room":"Dalvey Room","eve_add":"9 Kent Ridge Drive","eve_count":"Singapore","eve_copos":"119241","eve_map_url":"https:\/\/goo.gl\/maps\/nXn2Luh96pH2"},"eve_location_5":{"eve_org":"Institute of Data Science","eve_build":"Innovation 4.0","eve_room":"1-3","eve_add":"3 Research Link","eve_count":"Singapore","eve_copos":"117602","eve_map_url":"https:\/\/goo.gl\/maps\/i1xocvvDh27QUXem7"},"eve_location_6":{"eve_org":"","eve_build":"","eve_room":"","eve_add":"","eve_count":"","eve_copos":"","eve_map_url":""},"eve_location_7":""},"event_introduction":"","event_short_intro":"","event_topic":null,"event_banner":false,"event_external_url":"","event_registration_details":{"event_registration_form":false,"event_registration_message":"","event_registration_deadline":null,"eve_registration_url":"","event_form":"","event_registration_ack":""},"event_speaker":[{"event_speaker_name":"Wenxin Zhang","event_speaker_designation":"","event_speaker_affiliation":"Columbia Business School","event_speaker_picture":false,"event_speaker_url":"","event_speaker_introduction":"<p>Wenxin Zhang is a PhD candidate in the Decision, Risk, and Operations Division at Columbia Business School. Her research focuses on dynamic resource allocation in large-scale service systems, with emphasis on improving the efficiency of modern AI and large language model serving. She bridges theory and practice through collaborations with Google Research and ParkHub. Her work has been published in Operations Research, ACM EC, and NeurIPS, and was recognized as a finalist for the 2025 Applied Probability Society Best Student Paper Competition. She received her B.E. in Industrial Engineering from Tsinghua University.<\/p>\n"}],"event_agenda":false,"event_photo_gallery":false,"event_presentations":false,"event_custom_heading":[{"event_custom_title":"Abstract","event_custom_details":"<p>Modern AI services rely on vast, expensive computational resources and processing power, all sustained by a global network of data centers. A key operational challenge is ensuring these systems remain responsive by efficiently routing requests. This talk introduces the Greatest Marginal Service Rate (GMSR) policy, a novel policy that routes each request to the data center where it will have the highest marginal impact on the current service rate. GMSR is designed to be fully distributed, with routers in different geographic regions making decisions independently using only local information. This design makes the system scalable and resilient while eliminating the need for complex coordination. We prove that, despite its distributed design, the GMSR policy converges to the globally optimal solution that a central coordinator would choose to minimize system-wide latency. Furthermore, the policy is robust: even when the system is overloaded, it maximizes throughput and minimizes latency for all completed requests. This work provides a practical and provably optimal load balancer for building the next generation of scalable and responsive AI systems.<\/p>\n"}],"event_enquiry_details":{"event_enq_full_name":"","event_enq_department":"","event_enq_email":"","event_enq_telephone":"","event_enq_website":""}}]