A searchable list of some of my publications is below. You can also access my publications from the following sites.
My ORCID is
Publications:
Seung Hyun Lee, Yinxiao Li, Junjie Ke, Innfarn Yoo, Han Zhang, Jiahui Yu, Qifei Wang, Fei Deng, Glenn Entis, Junfeng He, Gang Li, Sangpil Kim, Irfan Essa, Feng Yang
Parrot: Pareto-optimal multi-reward reinforcement learning framework for text-to-image generation Proceedings Article
In: Proceedings of the European Conference on Computer Vision (ECCV), 2024.
Abstract | Links | BibTeX | Tags: arXiv, computer vision, ECCV, generative AI, google, reinforcement learning
@inproceedings{2024-Lee-PPMRLFTG,
title = {Parrot: Pareto-optimal multi-reward reinforcement learning framework for text-to-image generation},
author = {Seung Hyun Lee and Yinxiao Li and Junjie Ke and Innfarn Yoo and Han Zhang and Jiahui Yu and Qifei Wang and Fei Deng and Glenn Entis and Junfeng He and Gang Li and Sangpil Kim and Irfan Essa and Feng Yang},
url = {https://arxiv.org/abs/2401.05675
https://arxiv.org/pdf/2401.05675
https://dl.acm.org/doi/10.1007/978-3-031-72920-1_26},
doi = {10.48550/arXiv.2401.05675},
year = {2024},
date = {2024-07-25},
urldate = {2024-07-25},
booktitle = {Proceedings of the European Conference on Computer Vision (ECCV)},
abstract = {Recent works have demonstrated that using reinforcement learning (RL) with multiple quality rewards can improve the quality of generated images in text-to-image (T2I) generation. However, manually adjusting reward weights poses challenges and may cause over-optimization of certain metrics. To solve this, we propose Parrot, which addresses the issue through multi-objective optimization and introduces an effective multi-reward optimization strategy to approximate Pareto optimality. Utilizing batch-wise Pareto-optimal selection, Parrot automatically identifies the optimal trade-off among different rewards. We use this novel multi-reward optimization algorithm to jointly optimize the T2I model and a prompt expansion network, resulting in significant improvements in image quality and allowing control over the trade-off among different rewards via a reward-related prompt during inference. Furthermore, we introduce original prompt-centered guidance at inference time, ensuring fidelity to user input after prompt expansion. Extensive experiments and a user study validate the superiority of Parrot over several baselines across various quality criteria, including aesthetics, human preference, text-image alignment, and image sentiment.},
keywords = {arXiv, computer vision, ECCV, generative AI, google, reinforcement learning},
pubstate = {published},
tppubtype = {inproceedings}
}
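To make the batch-wise Pareto-optimal selection concrete, below is a minimal Python sketch of non-dominated filtering over per-sample reward vectors. It assumes only that each generated image in a batch carries K scalar reward scores; the function name and surrounding setup are illustrative, not the paper's implementation.

import numpy as np

def pareto_optimal_indices(rewards):
    # rewards: (N, K) array with one row of K reward scores (e.g.,
    # aesthetics, human preference, alignment, sentiment) per image.
    # Sample i is dominated if another sample scores >= on every reward
    # and strictly > on at least one.
    keep = []
    for i in range(len(rewards)):
        dominated = np.all(rewards >= rewards[i], axis=1) & \
                    np.any(rewards > rewards[i], axis=1)
        if not dominated.any():
            keep.append(i)
    return keep

During RL fine-tuning, only the samples returned here would receive update weight, so the trade-off among rewards is discovered per batch rather than fixed by hand-tuned reward weights.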
K. Niranjan Kumar, Irfan Essa, Sehoon Ha
Cascaded Compositional Residual Learning for Complex Interactive Behaviors Journal Article
In: IEEE Robotics and Automation Letters, vol. 8, iss. 8, pp. 4601–4608, 2023.
Abstract | Links | BibTeX | Tags: IEEE, reinforcement learning, robotics
@article{2023-Kumar-CCRLCIB,
title = {Cascaded Compositional Residual Learning for Complex Interactive Behaviors},
author = {K. Niranjan Kumar and Irfan Essa and Sehoon Ha},
url = {https://ieeexplore.ieee.org/document/10152471},
doi = {10.1109/LRA.2023.3286171},
year = {2023},
date = {2023-06-14},
urldate = {2023-06-14},
journal = {IEEE Robotics and Automation Letters},
volume = {8},
issue = {8},
pages = {4601--4608},
abstract = {Real-world autonomous missions often require rich interaction with nearby objects, such as doors or switches, along with effective navigation. However, such complex behaviors are difficult to learn because they involve both high-level planning and low-level motor control. We present a novel framework, Cascaded Compositional Residual Learning (CCRL), which learns composite skills by recursively leveraging a library of previously learned control policies. Our framework combines multiple levels of pre-learned skills by using multiplicative skill composition and residual action learning. We also introduce a goal synthesis network and an observation selector to support the combination of heterogeneous skills, each with its unique goals and observation space. Finally, we develop residual regularization for learning policies that solve a new task while preserving the style of the motion enforced by the skill library. We show that our framework learns joint-level control policies for a diverse set of motor skills ranging from basic locomotion to complex interactive navigation, including navigating around obstacles, pushing objects, crawling under a table, pushing a door open with its leg, and holding it open while walking through it. The proposed CCRL framework leads to policies with consistent styles and lower joint torques, which successfully transfer to a real Unitree A1 robot without any additional fine-tuning.},
keywords = {IEEE, reinforcement learning, robotics},
pubstate = {published},
tppubtype = {article}
}
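As a rough illustration of the multiplicative skill composition plus residual action learning described above, here is a sketch that assumes each frozen skill policy outputs a Gaussian over actions and the new policy outputs per-skill gates; the precision-weighted product form is one standard way to realize multiplicative composition and is an assumption here, not necessarily the paper's exact parameterization.

import numpy as np

def compose_action(mus, sigmas, gates, residual):
    # mus, sigmas: (num_skills, action_dim) means/stddevs from the frozen
    # skill library; gates: (num_skills,) non-negative weights produced by
    # the newly trained policy. The composite mean is the precision-weighted
    # product of the skill Gaussians; the residual is a small task-specific
    # correction, regularized during training to preserve the library's style.
    prec = gates[:, None] / sigmas**2
    mean = (prec * mus).sum(axis=0) / prec.sum(axis=0)
    return mean + residual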
Erik Wijmans, Irfan Essa, Dhruv Batra
VER: Scaling On-Policy RL Leads to the Emergence of Navigation in Embodied Rearrangement Proceedings Article
In: Oh, Alice H., Agarwal, Alekh, Belgrave, Danielle, Cho, Kyunghyun (Ed.): Advances in Neural Information Processing Systems (NeurIPS), 2022.
Abstract | Links | BibTeX | Tags: machine learning, NeurIPS, reinforcement learning, robotics
@inproceedings{2022-Wijmans-SOLENER,
title = {VER: Scaling On-Policy RL Leads to the Emergence of Navigation in Embodied Rearrangement},
author = {Erik Wijmans and Irfan Essa and Dhruv Batra},
editor = {Alice H. Oh and Alekh Agarwal and Danielle Belgrave and Kyunghyun Cho},
url = {https://arxiv.org/abs/2210.05064
https://openreview.net/forum?id=VrJWseIN98},
doi = {10.48550/ARXIV.2210.05064},
year = {2022},
date = {2022-12-01},
urldate = {2022-12-01},
booktitle = {Advances in Neural Information Processing Systems (NeurIPS)},
abstract = {We present Variable Experience Rollout (VER), a technique for efficiently scaling batched on-policy reinforcement learning in heterogeneous environments (where different environments take vastly different times to generate rollouts) to many GPUs residing on, potentially, many machines. VER combines the strengths of and blurs the line between synchronous and asynchronous on-policy RL methods (SyncOnRL and AsyncOnRL, respectively). Specifically, it learns from on-policy experience (like SyncOnRL) and has no synchronization points (like AsyncOnRL), enabling high throughput.
We find that VER leads to significant and consistent speed-ups across a broad range of embodied navigation and mobile manipulation tasks in photorealistic 3D simulation environments. Specifically, for PointGoal navigation and ObjectGoal navigation in Habitat 1.0, VER is 60-100% faster (1.6-2x speedup) than DD-PPO, the current state of the art for distributed SyncOnRL, with similar sample efficiency. For mobile manipulation tasks (open fridge/cabinet, pick/place objects) in Habitat 2.0, VER is 150% faster (2.5x speedup) on 1 GPU and 170% faster (2.7x speedup) on 8 GPUs than DD-PPO. Compared to SampleFactory (the current state-of-the-art AsyncOnRL), VER matches its speed on 1 GPU, and is 70% faster (1.7x speedup) on 8 GPUs with better sample efficiency.
We leverage these speed-ups to train chained skills for GeometricGoal rearrangement tasks in the Home Assistant Benchmark (HAB). We find a surprising emergence of navigation in skills that do not ostensibly require any navigation. Specifically, the Pick skill involves a robot picking an object from a table. During training the robot was always spawned close to the table and never needed to navigate. However, we find that if base movement is part of the action space, the robot learns to navigate and then pick an object in new environments with 50% success, demonstrating surprisingly high out-of-distribution generalization.},
keywords = {machine learning, NeurIPS, reinforcement learning, robotics},
pubstate = {published},
tppubtype = {inproceedings}
}
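The scheduling idea behind VER can be shown with a toy producer-consumer sketch: workers step environments of very different speeds, and the learner consumes a fixed-size batch of fresh on-policy steps as soon as one exists. Everything below (the names, the queue-based transport) is a single-machine simplification for illustration, not the paper's multi-GPU system.

import queue
import random
import threading
import time

def update_policy(batch):
    # Stand-in for a PPO-style on-policy update.
    print(f"update on {len(batch)} steps")

def worker(step_queue, env_delay):
    # Environments take vastly different times per step; each worker
    # contributes experience as fast as its own environment allows.
    while True:
        time.sleep(env_delay)
        step_queue.put({"obs": random.random()})

def learner(step_queue, batch_size, num_updates):
    # Learn from a fixed-size batch of fresh on-policy steps: no global
    # synchronization point (unlike SyncOnRL) and no stale off-policy
    # data (unlike AsyncOnRL).
    for _ in range(num_updates):
        batch = [step_queue.get() for _ in range(batch_size)]
        update_policy(batch)

step_queue = queue.Queue()
for delay in (0.01, 0.05, 0.2):  # heterogeneous environment speeds
    threading.Thread(target=worker, args=(step_queue, delay), daemon=True).start()
learner(step_queue, batch_size=32, num_updates=3)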
Niranjan Kumar, Irfan Essa, Sehoon Ha
Graph-based Cluttered Scene Generation and Interactive Exploration using Deep Reinforcement Learning Proceedings Article
In: Proceedings International Conference on Robotics and Automation (ICRA), pp. 7521-7527, 2022.
Abstract | Links | BibTeX | Tags: ICRA, machine learning, reinforcement learning, robotics
@inproceedings{2021-Kumar-GCSGIEUDRL,
title = {Graph-based Cluttered Scene Generation and Interactive Exploration using Deep Reinforcement Learning},
author = {Niranjan Kumar and Irfan Essa and Sehoon Ha},
url = {https://doi.org/10.1109/ICRA46639.2022.9811874
https://arxiv.org/abs/2109.10460
https://arxiv.org/pdf/2109.10460
https://www.kniranjankumar.com/projects/5_clutr
https://kniranjankumar.github.io/assets/pdf/graph_based_clutter.pdf
https://youtu.be/T2Jo7wwaXss},
doi = {10.1109/ICRA46639.2022.9811874},
year = {2022},
date = {2022-05-01},
urldate = {2022-05-01},
booktitle = {Proceedings International Conference on Robotics and Automation (ICRA)},
pages = {7521-7527},
abstract = {We introduce a novel method to teach a robotic agent to interactively explore cluttered yet structured scenes, such as kitchen pantries and grocery shelves, by leveraging the physical plausibility of the scene. We propose a novel learning framework to train an effective scene exploration policy to discover hidden objects with minimal interactions. First, we define a novel scene grammar to represent structured clutter. Then we train a Graph Neural Network (GNN) based Scene Generation agent using deep reinforcement learning (deep RL) to manipulate this Scene Grammar to create a diverse set of stable scenes, each containing multiple hidden objects. Given such cluttered scenes, we then train a Scene Exploration agent, using deep RL, to uncover hidden objects by interactively rearranging the scene.},
keywords = {ICRA, machine learning, reinforcement learning, robotics},
pubstate = {published},
tppubtype = {inproceedings}
}
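As a rough sketch of the scene-grammar idea, structured clutter can be represented as a support graph that a generation agent grows one placement at a time; the classes and the placement action below are hypothetical simplifications, not the paper's grammar or its GNN policy.

from dataclasses import dataclass, field

@dataclass
class SceneNode:
    # One object in the clutter graph; children are objects resting on or
    # stacked against this one. Objects occluded along the viewing
    # direction are the hidden objects the exploration agent must uncover.
    name: str
    children: list = field(default_factory=list)

def place(support, obj):
    # A generation-agent action (sketch): grow the graph by one placement.
    # The paper's GNN-based agent chooses such edits with deep RL so that
    # scenes remain physically stable and contain hidden objects.
    support.children.append(obj)
    return obj

shelf = SceneNode("shelf")
box = place(shelf, SceneNode("cereal_box"))
place(shelf, SceneNode("soup_can"))  # may end up hidden behind the box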
Niranjan Kumar, Irfan Essa, Sehoon Ha
Cascaded Compositional Residual Learning for Complex Interactive Behaviors Proceedings Article
In: Sim-to-Real Robot Learning: Locomotion and Beyond Workshop at the Conference on Robot Learning (CoRL), arXiv, 2022.
Abstract | Links | BibTeX | Tags: reinforcement learning, robotics
@inproceedings{2022-Kumar-CCRLCIB,
title = {Cascaded Compositional Residual Learning for Complex Interactive Behaviors},
author = {Niranjan Kumar and Irfan Essa and Sehoon Ha},
url = {https://arxiv.org/abs/2212.08954
https://www.kniranjankumar.com/ccrl/static/pdf/paper.pdf
https://youtu.be/fAklIxiK7Qg},
doi = {10.48550/ARXIV.2212.08954},
year = {2022},
date = {2022-01-01},
urldate = {2022-01-01},
booktitle = {Sim-to-Real Robot Learning: Locomotion and Beyond Workshop at the Conference on Robot Learning (CoRL)},
publisher = {arXiv},
abstract = {Real-world autonomous missions often require rich interaction with nearby objects, such as doors or switches, along with effective navigation. However, such complex behaviors are difficult to learn because they involve both high-level planning and low-level motor control. We present a novel framework, Cascaded Compositional Residual Learning (CCRL), which learns composite skills by recursively leveraging a library of previously learned control policies. Our framework learns multiplicative policy composition, task-specific residual actions, and synthetic goal information simultaneously while freezing the prerequisite policies. We further explicitly control the style of the motion by regularizing residual actions. We show that our framework learns joint-level control policies for a diverse set of motor skills ranging from basic locomotion to complex interactive navigation, including navigating around obstacles, pushing objects, crawling under a table, pushing a door open with its leg, and holding it open while walking through it. The proposed CCRL framework leads to policies with consistent styles and lower joint torques, which we successfully transfer to a real Unitree A1 robot without any additional fine-tuning.},
keywords = {reinforcement learning, robotics},
pubstate = {published},
tppubtype = {inproceedings}
}
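The workshop version emphasizes explicit regularization of residual actions; one simple way such a term can enter the training objective is sketched below, with the penalty weight and all names being assumptions rather than the paper's exact formulation.

import numpy as np

def regularized_objective(task_return, residual_actions, lam=0.1):
    # residual_actions: (T, action_dim) residuals over an episode. The
    # penalty keeps corrections small so the composite motion retains the
    # style enforced by the frozen skill library.
    return task_return - lam * np.mean(np.sum(residual_actions**2, axis=-1))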
Niranjan Kumar, Irfan Essa, Sehoon Ha, C. Karen Liu
Estimating Mass Distribution of Articulated Objects through Non-prehensile Manipulation Proceedings Article
In: Neural Information Processing Systems (NeurIPS) Workshop on Object Representations for Learning and Reasoning, NeurIPS 2020.
Abstract | Links | BibTeX | Tags: reinforcement learning, robotics
@inproceedings{2020-Kumar-EMDAOTNM,
title = {Estimating Mass Distribution of Articulated Objects through Non-prehensile Manipulation},
author = {Niranjan Kumar and Irfan Essa and Sehoon Ha and C. Karen Liu},
url = {https://orlrworkshop.github.io/program/orlr_25.html
http://arxiv.org/abs/1907.03964
https://www.kniranjankumar.com/projects/1_mass_prediction
https://www.youtube.com/watch?v=o3zBdVWvWZw
https://kniranjankumar.github.io/assets/pdf/Estimating_Mass_Distribution_of_Articulated_Objects_using_Non_prehensile_Manipulation.pdf},
year = {2020},
date = {2020-12-01},
urldate = {2020-12-01},
booktitle = {Neural Information Processing Systems (NeurIPS) Workshop on Object Representations for Learning and Reasoning},
organization = {NeurIPS},
abstract = {We explore the problem of estimating the mass distribution of an articulated object by an interactive robotic agent. Our method predicts the mass distribution of an object by using limited sensing and actuating capabilities of a robotic agent that is interacting with the object. We are inspired by the role of exploratory play in human infants. We take the combined approach of supervised and reinforcement learning to train an agent that learns to strategically interact with the object to estimate the object's mass distribution. Our method consists of two neural networks: (i) the policy network which decides how to interact with the object, and (ii) the predictor network that estimates the mass distribution given a history of observations and interactions. Using our method, we train a robotic arm to estimate the mass distribution of an object with moving parts (e.g. an articulated rigid body system) by pushing it on a surface with unknown friction properties. We also demonstrate how our training from simulations can be transferred to real hardware using a small amount of real-world data for fine-tuning. We use a UR10 robot to interact with 3D printed articulated chains with varying mass distributions and show that our method significantly outperforms the baseline system that uses random pushes to interact with the object.},
howpublished = {arXiv preprint arXiv:1907.03964},
keywords = {reinforcement learning, robotics},
pubstate = {published},
tppubtype = {inproceedings}
}
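To illustrate why the interaction policy and the predictor are trained together, here is a self-contained toy in which pushes on a two-link object yield displacements that depend on its hidden masses, and the masses are recovered by least squares. The toy dynamics and all names are invented for illustration; the paper uses neural networks for both the policy and the predictor.

import numpy as np

rng = np.random.default_rng(0)

class ToyArticulatedObject:
    # Two links with hidden masses; a push at position x in [0, 1]
    # produces a noisy displacement that mixes the links' inverse masses.
    def __init__(self):
        self.masses = rng.uniform(0.5, 2.0, size=2)
    def push(self, x):
        share = np.array([1.0 - x, x])
        return share @ (1.0 / self.masses) + rng.normal(0.0, 0.01)

def estimate_masses(obj, pushes):
    # Supervised part (sketch): fit inverse masses from observed
    # displacements. The RL policy's role in the paper is to choose
    # pushes that make this estimation problem well conditioned.
    A = np.array([[1.0 - x, x] for x in pushes])
    y = np.array([obj.push(x) for x in pushes])
    inv_m, *_ = np.linalg.lstsq(A, y, rcond=None)
    return 1.0 / inv_m

obj = ToyArticulatedObject()
print("true:", obj.masses, "estimated:", estimate_masses(obj, [0.1, 0.9]))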
A. Schödl, I. Essa
Machine Learning for Video-Based Rendering Proceedings Article
In: Advances in Neural Information Processing Systems (NeurIPS), pp. 1002-1008, 2000.
Links | BibTeX | Tags: computer animation, reinforcement learning, video textures
@inproceedings{2000-Schodl-MLVR,
title = {Machine Learning for Video-Based Rendering},
author = {A. Schödl and I. Essa},
url = {https://www.cc.gatech.edu/cpl/projects/videotexture/NIPS2000/index.html},
year = {2000},
date = {2000-12-01},
booktitle = {Advances in Neural Information Processing Systems (NeurIPS)},
pages = {1002-1008},
keywords = {computer animation, reinforcement learning, video textures},
pubstate = {published},
tppubtype = {inproceedings}
}
Other Publication Sites
A few more sites that aggregate research publications: Academia.edu, Bibsonomy, CiteULike, Mendeley.
Copyright/About
[Please see the Copyright Statement that may apply to the content listed here.]
This list of publications is produced by using the teachPress plugin for WordPress.