A searchable list of some of my publications is below. You can also access my publications from the following sites.
My ORCID is
Publications:
Xiang Kong, Lu Jiang, Huiwen Chang, Han Zhang, Yuan Hao, Haifeng Gong, Irfan Essa
BLT: Bidirectional Layout Transformer for Controllable Layout Generation Proceedings Article
In: European Conference on Computer Vision (ECCV), 2022, ISBN: 978-3-031-19789-5.
Abstract | Links | BibTeX | Tags: computer vision, ECCV, generative AI, generative media, google, vision transformer
@inproceedings{2022-Kong-BLTCLG,
title = {BLT: Bidirectional Layout Transformer for Controllable Layout Generation},
author = {Xiang Kong and Lu Jiang and Huiwen Chang and Han Zhang and Yuan Hao and Haifeng Gong and Irfan Essa},
url = {https://arxiv.org/abs/2112.05112
https://rdcu.be/c61AE},
doi = {10.1007/978-3-031-19790-1_29},
isbn = {978-3-031-19789-5},
year = {2022},
date = {2022-10-25},
urldate = {2022-10-25},
booktitle = {European Conference on Computer Vision (ECCV)},
volume = {13677},
abstract = {Creating visual layouts is a critical step in graphic design. Automatic generation of such layouts is essential for scalable and diverse visual designs. To advance conditional layout generation, we introduce BLT, a bidirectional layout transformer. BLT differs from previous work on transformers in adopting non-autoregressive transformers. In training, BLT learns to predict the masked attributes by attending to surrounding attributes in two directions. During inference, BLT first generates a draft layout from the input and then iteratively refines it into a high-quality layout by masking out low-confident attributes. The masks generated in both training and inference are controlled by a new hierarchical sampling policy. We verify the proposed model on six benchmarks of diverse design tasks. Experimental results demonstrate two benefits compared to the state-of-the-art layout transformer models. First, our model empowers layout transformers to fulfill controllable layout generation. Second, it achieves up to 10x speedup in generating a layout at inference time over the layout transformer baseline. Code is released at https://shawnkx.github.io/blt.},
keywords = {computer vision, ECCV, generative AI, generative media, google, vision transformer},
pubstate = {published},
tppubtype = {inproceedings}
}
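As a rough illustration of the decoding idea summarized in the abstract above (predict all masked layout attributes in parallel, then repeatedly re-mask and re-predict the least confident ones), here is a minimal Python sketch. The model callable, the mask token id, and the linear unmasking schedule are assumptions for illustration; it omits BLT's hierarchical grouping of attributes and is not the released code (see https://shawnkx.github.io/blt for that).

import torch

MASK_ID = 0  # hypothetical id of the [MASK] attribute token

def iterative_decode(model, tokens, mask, steps=4):
    """Non-autoregressive decoding: fill every masked position in parallel,
    then re-mask and re-predict the least confident ones each round.
    tokens: (seq_len,) long tensor with MASK_ID at unknown positions
    mask:   (seq_len,) bool tensor, True where attributes must be generated
    """
    tokens, mask = tokens.clone(), mask.clone()
    total = int(mask.sum().item())
    for step in range(steps):
        logits = model(tokens.unsqueeze(0)).squeeze(0)    # (seq_len, vocab)
        conf, pred = logits.softmax(dim=-1).max(dim=-1)   # per-position confidence
        tokens = torch.where(mask, pred, tokens)          # fill the still-masked positions
        n_remask = int(total * (1 - (step + 1) / steps))  # linear schedule toward zero
        if n_remask == 0:
            break
        conf = conf.masked_fill(~mask, float("inf"))      # only re-mask generated positions
        remask_idx = conf.argsort()[:n_remask]            # lowest-confidence predictions
        tokens[remask_idx] = MASK_ID
        mask = torch.zeros_like(mask)
        mask[remask_idx] = True                           # next round re-predicts only these
    return tokens

# Example with a stand-in "model" returning random logits over 128 attribute values:
# out = iterative_decode(lambda x: torch.randn(x.shape[0], x.shape[1], 128),
#                        torch.zeros(25, dtype=torch.long),
#                        torch.ones(25, dtype=torch.bool))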
Peggy Chi, Tao Dong, Christian Frueh, Brian Colonna, Vivek Kwatra, Irfan Essa
Synthesis-Assisted Video Prototyping From a Document Proceedings Article
In: Proceedings of the 35th Annual ACM Symposium on User Interface Software and Technology, pp. 1–10, 2022.
Abstract | Links | BibTeX | Tags: computational video, generative media, google, human-computer interaction, UIST, video editing
@inproceedings{2022-Chi-SVPFD,
title = {Synthesis-Assisted Video Prototyping From a Document},
author = {Peggy Chi and Tao Dong and Christian Frueh and Brian Colonna and Vivek Kwatra and Irfan Essa},
url = {https://research.google/pubs/pub51631/
https://dl.acm.org/doi/abs/10.1145/3526113.3545676},
doi = {10.1145/3526113.3545676},
year = {2022},
date = {2022-10-01},
urldate = {2022-10-01},
booktitle = {Proceedings of the 35th Annual ACM Symposium on User Interface Software and Technology},
pages = {1--10},
abstract = {Video productions commonly start with a script, especially for talking head videos that feature a speaker narrating to the camera. When the source materials come from a written document -- such as a web tutorial -- it takes iterations to refine content from a text article to a spoken dialogue, while considering visual compositions in each scene. We propose Doc2Video, a video prototyping approach that converts a document to interactive scripting with a preview of synthetic talking head videos. Our pipeline decomposes a source document into a series of scenes, each automatically creating a synthesized video of a virtual instructor. Designed for a specific domain -- programming cookbooks -- we apply visual elements from the source document, such as a keyword, a code snippet or a screenshot, in suitable layouts. Users edit narration sentences, break or combine sections, and modify visuals to prototype a video in our Editing UI. We evaluated our pipeline with public programming cookbooks. Feedback from professional creators shows that our method provided a reasonable starting point to engage them in interactive scripting for a narrated instructional video.},
keywords = {computational video, generative media, google, human-computer interaction, UIST, video editing},
pubstate = {published},
tppubtype = {inproceedings}
}
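The pipeline described above starts by decomposing a source document into scenes with narration text and visual assets. The sketch below illustrates only that first decomposition step on a Markdown-like tutorial (headings start scenes, fenced code blocks become visual assets, remaining prose becomes candidate narration sentences); the data layout and rules are hypothetical and are not the authors' Doc2Video implementation.

import re
from dataclasses import dataclass, field

FENCE = "`" * 3  # Markdown code fence

@dataclass
class Scene:
    title: str
    narration: list = field(default_factory=list)   # sentences to synthesize as voiceover
    visuals: list = field(default_factory=list)     # code snippets / screenshots to show

def document_to_scenes(markdown_text):
    scenes, current = [], Scene(title="Intro")
    in_code, code_lines = False, []
    for line in markdown_text.splitlines():
        if line.startswith(FENCE):
            if in_code:                              # closing fence: store the snippet
                current.visuals.append("\n".join(code_lines))
                code_lines = []
            in_code = not in_code
        elif in_code:
            code_lines.append(line)
        elif line.startswith("#"):                   # a heading opens a new scene
            scenes.append(current)
            current = Scene(title=line.lstrip("# ").strip())
        elif line.strip():
            current.narration += re.split(r"(?<=[.!?])\s+", line.strip())
    scenes.append(current)
    return [s for s in scenes if s.narration or s.visuals]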
Harish Haresamudram, Irfan Essa, Thomas Ploetz
Assessing the State of Self-Supervised Human Activity Recognition using Wearables Journal Article
In: Proceedings of the ACM on Interactive, Mobile, Wearable and Ubiquitous Technologies (IMWUT), vol. 6, iss. 3, no. 116, pp. 1–47, 2022.
Abstract | Links | BibTeX | Tags: activity recognition, IMWUT, ubiquitous computing, wearable computing
@article{2022-Haresamudram-ASSHARUW,
title = {Assessing the State of Self-Supervised Human Activity Recognition using Wearables},
author = {Harish Haresamudram and Irfan Essa and Thomas Ploetz},
url = {https://dl.acm.org/doi/10.1145/3550299
https://arxiv.org/abs/2202.12938
https://arxiv.org/pdf/2202.12938
},
doi = {10.1145/3550299},
year = {2022},
date = {2022-09-07},
urldate = {2022-09-07},
booktitle = {Proceedings of the ACM on Interactive, Mobile, Wearable and Ubiquitous Technologies (IMWUT)},
journal = {Proceedings of the ACM on Interactive, Mobile, Wearable and Ubiquitous Technologies (IMWUT)},
volume = {6},
number = {116},
issue = {3},
pages = {1--47},
publisher = {ACM},
abstract = {The emergence of self-supervised learning in the field of wearables-based human activity recognition (HAR) has opened up opportunities to tackle the most pressing challenges in the field, namely to exploit unlabeled data to derive reliable recognition systems for scenarios where only small amounts of labeled training samples can be collected. As such, self-supervision, i.e., the paradigm of 'pretrain-then-finetune', has the potential to become a strong alternative to the predominant end-to-end training approaches, let alone hand-crafted features for the classic activity recognition chain. Recently, a number of contributions have been made that introduced self-supervised learning into the field of HAR, including Multi-task self-supervision, Masked Reconstruction, CPC, and SimCLR, to name but a few. With the initial success of these methods, the time has come for a systematic inventory and analysis of the potential self-supervised learning has for the field. This paper provides exactly that. We assess the progress of self-supervised HAR research by introducing a framework that performs a multi-faceted exploration of model performance. We organize the framework into three dimensions, each containing three constituent criteria, such that each dimension captures specific aspects of performance, including the robustness to differing source and target conditions, the influence of dataset characteristics, and the feature space characteristics. We utilize this framework to assess seven state-of-the-art self-supervised methods for HAR, leading to the formulation of insights into the properties of these techniques and to establish their value towards learning representations for diverse scenarios.
},
keywords = {activity recognition, IMWUT, ubiquitous computing, wearable computing},
pubstate = {published},
tppubtype = {article}
}
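The 'pretrain-then-finetune' paradigm that this assessment revolves around can be summarized in a few lines. The skeleton below is generic (placeholder 1D-conv encoder, arbitrary window and channel sizes, any self-supervised loss passed in as a callable) and does not correspond to any specific method evaluated in the paper.

import torch
import torch.nn as nn

class ConvEncoder(nn.Module):
    """Placeholder encoder for (batch, channels, time) sensor windows."""
    def __init__(self, channels=3, feat_dim=64):
        super().__init__()
        self.net = nn.Sequential(
            nn.Conv1d(channels, 32, kernel_size=5, padding=2), nn.ReLU(),
            nn.Conv1d(32, feat_dim, kernel_size=5, padding=2), nn.ReLU(),
            nn.AdaptiveAvgPool1d(1), nn.Flatten())

    def forward(self, x):
        return self.net(x)                               # (batch, feat_dim)

def pretrain(encoder, unlabeled_loader, ssl_loss_fn, epochs=10):
    """Stage 1: fit the encoder on unlabeled windows with any self-supervised loss."""
    opt = torch.optim.Adam(encoder.parameters(), lr=1e-3)
    for _ in range(epochs):
        for windows in unlabeled_loader:
            loss = ssl_loss_fn(encoder, windows)
            opt.zero_grad(); loss.backward(); opt.step()

def finetune(encoder, labeled_loader, num_classes, feat_dim=64, epochs=10):
    """Stage 2: train only a small classifier head on the few labeled windows."""
    head = nn.Linear(feat_dim, num_classes)
    opt = torch.optim.Adam(head.parameters(), lr=1e-3)   # encoder stays frozen
    ce = nn.CrossEntropyLoss()
    for _ in range(epochs):
        for windows, labels in labeled_loader:
            with torch.no_grad():
                feats = encoder(windows)
            loss = ce(head(feats), labels)
            opt.zero_grad(); loss.backward(); opt.step()
    return head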
Daniel Nkemelu, Harshil Shah, Irfan Essa, Michael L. Best
Tackling Hate Speech in Low-resource Languages with Context Experts Proceedings Article
In: International Conference on Information & Communication Technologies and Development (ICTD), 2022.
Abstract | Links | BibTeX | Tags: computational journalism, ICTD, social computing
@inproceedings{2022-Nkemelu-THSLLWCE,
title = {Tackling Hate Speech in Low-resource Languages with Context Experts},
author = {Daniel Nkemelu and Harshil Shah and Irfan Essa and Michael L. Best},
url = {https://www.nkemelu.com/data/ictd2022_nkemelu_final.pdf
},
year = {2022},
date = {2022-06-01},
urldate = {2022-06-01},
booktitle = {International Conference on Information & Communication Technologies and Development (ICTD)},
abstract = {Given Myanmar's historical and socio-political context, hate speech spread on social media has escalated into offline unrest and violence. This paper presents findings from our remote study on the automatic detection of hate speech online in Myanmar. We argue that effectively addressing this problem will require community-based approaches that combine the knowledge of context experts with machine learning tools that can analyze the vast amount of data produced. To this end, we develop a systematic process to facilitate this collaboration covering key aspects of data collection, annotation, and model validation strategies. We highlight challenges in this area stemming from small and imbalanced datasets, the need to balance non-glamorous data work and stakeholder priorities, and closed data sharing practices. Stemming from these findings, we discuss avenues for further work in developing and deploying hate speech detection systems for low-resource languages.},
keywords = {computational journalism, ICTD, social computing},
pubstate = {published},
tppubtype = {inproceedings}
}
Niranjan Kumar, Irfan Essa, Sehoon Ha
Graph-based Cluttered Scene Generation and Interactive Exploration using Deep Reinforcement Learning Proceedings Article
In: Proceedings International Conference on Robotics and Automation (ICRA), pp. 7521-7527, 2022.
Abstract | Links | BibTeX | Tags: ICRA, machine learning, reinforcement learning, robotics
@inproceedings{2021-Kumar-GCSGIEUDRL,
title = {Graph-based Cluttered Scene Generation and Interactive Exploration using Deep Reinforcement Learning},
author = {Niranjan Kumar and Irfan Essa and Sehoon Ha},
url = {https://doi.org/10.1109/ICRA46639.2022.9811874
https://arxiv.org/abs/2109.10460
https://arxiv.org/pdf/2109.10460
https://www.kniranjankumar.com/projects/5_clutr
https://kniranjankumar.github.io/assets/pdf/graph_based_clutter.pdf
https://youtu.be/T2Jo7wwaXss},
doi = {10.1109/ICRA46639.2022.9811874},
year = {2022},
date = {2022-05-01},
urldate = {2022-05-01},
booktitle = {Proceedings International Conference on Robotics and Automation (ICRA)},
journal = {arXiv},
number = {2109.10460},
pages = {7521-7527},
abstract = {We introduce a novel method to teach a robotic agent to interactively explore cluttered yet structured scenes, such as kitchen pantries and grocery shelves, by leveraging the physical plausibility of the scene. We propose a novel learning framework to train an effective scene exploration policy to discover hidden objects with minimal interactions. First, we define a novel scene grammar to represent structured clutter. Then we train a Graph Neural Network (GNN) based Scene Generation agent using deep reinforcement learning (deep RL), to manipulate this Scene Grammar to create a diverse set of stable scenes, each containing multiple hidden objects. Given such cluttered scenes, we then train a Scene Exploration agent, using deep RL, to uncover hidden objects by interactively rearranging the scene.
},
keywords = {ICRA, machine learning, reinforcement learning, robotics},
pubstate = {published},
tppubtype = {inproceedings}
}
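As a toy illustration of the kind of structured-clutter representation the abstract refers to, the snippet below encodes a shelf as a list of objects with "in front of" relations and derives which objects are hidden. The field names and layout are invented for exposition and are unrelated to the paper's actual scene grammar or its GNN-based agents.

from dataclasses import dataclass, field

@dataclass
class SceneObject:
    name: str
    position: tuple                                      # (x, depth) on the shelf
    in_front_of: list = field(default_factory=list)      # names of objects it occludes

def hidden_objects(scene):
    """An object is hidden if some other object stands directly in front of it."""
    occluded = {name for obj in scene for name in obj.in_front_of}
    return [obj for obj in scene if obj.name in occluded]

shelf = [
    SceneObject("cereal", (0.1, 0.0), in_front_of=["rice"]),
    SceneObject("rice",   (0.1, 0.3)),
    SceneObject("pasta",  (0.5, 0.0)),
]
print([o.name for o in hidden_objects(shelf)])           # ['rice']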
Karan Samel, Zelin Zhao, Binghong Chen, Shuang Li, Dharmashankar Subramanian, Irfan Essa, Le Song
Learning Temporal Rules from Noisy Timeseries Data Journal Article
In: arXiv preprint arXiv:2202.05403, 2022.
Abstract | Links | BibTeX | Tags: activity recognition, machine learning
@article{2022-Samel-LTRFNTD,
title = {Learning Temporal Rules from Noisy Timeseries Data},
author = {Karan Samel and Zelin Zhao and Binghong Chen and Shuang Li and Dharmashankar Subramanian and Irfan Essa and Le Song},
url = {https://arxiv.org/abs/2202.05403
https://arxiv.org/pdf/2202.05403},
year = {2022},
date = {2022-02-01},
urldate = {2022-02-01},
journal = {arXiv preprint arXiv:2202.05403},
abstract = {Events across a timeline are a common data representation, seen in different temporal modalities. Individual atomic events can occur in a certain temporal ordering to compose higher-level composite events. Examples of a composite event are a patient's medical symptom or a baseball player hitting a home run, caused by distinct temporal orderings of patient vitals and player movements, respectively. Such salient composite events are provided as labels in temporal datasets and most works optimize models to predict these composite event labels directly. We focus on uncovering the underlying atomic events and their relations that lead to the composite events within a noisy temporal data setting. We propose Neural Temporal Logic Programming (Neural TLP) which first learns implicit temporal relations between atomic events and then lifts logic rules for composite events, given only the composite events labels for supervision. This is done through efficiently searching through the combinatorial space of all temporal logic rules in an end-to-end differentiable manner. We evaluate our method on video and healthcare datasets where it outperforms the baseline methods for rule discovery.
},
keywords = {activity recognition, machine learning},
pubstate = {published},
tppubtype = {article}
}
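The core idea of scoring temporal orderings differentiably can be illustrated with a soft "before" predicate. The formulation below (a sigmoid of a time difference, with a product t-norm for conjunction) is a stand-in for exposition and is not the rule representation used by Neural TLP.

import torch

def soft_before(t_a, t_b, temperature=1.0):
    """Soft truth value of "atomic event A happens before atomic event B".
    t_a, t_b: tensors of (possibly predicted) event times; differentiable,
    so upstream event detectors can be trained end to end."""
    return torch.sigmoid((t_b - t_a) / temperature)

def rule_score(t_a, t_b, t_c):
    """Score a composite-event rule such as "A before B and B before C"
    with a soft conjunction (product t-norm)."""
    return soft_before(t_a, t_b) * soft_before(t_b, t_c)

# rule_score(torch.tensor(1.0), torch.tensor(2.5), torch.tensor(4.0))  # close to 1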
Chengzhi Mao, Lu Jiang, Mostafa Dehghani, Carl Vondrick, Rahul Sukthankar, Irfan Essa
Discrete Representations Strengthen Vision Transformer Robustness Proceedings Article
In: Proceedings of International Conference on Learning Representations (ICLR), 2022.
Abstract | Links | BibTeX | Tags: computer vision, google, machine learning, vision transformer
@inproceedings{2022-Mao-DRSVTR,
title = {Discrete Representations Strengthen Vision Transformer Robustness},
author = {Chengzhi Mao and Lu Jiang and Mostafa Dehghani and Carl Vondrick and Rahul Sukthankar and Irfan Essa},
url = {https://iclr.cc/virtual/2022/poster/6647
https://arxiv.org/abs/2111.10493
https://research.google/pubs/pub51388/
https://openreview.net/forum?id=8hWs60AZcWk},
doi = {10.48550/arXiv.2111.10493},
year = {2022},
date = {2022-01-28},
urldate = {2022-04-01},
booktitle = {Proceedings of International Conference on Learning Representations (ICLR)},
journal = {arXiv preprint arXiv:2111.10493},
abstract = {Vision Transformer (ViT) is emerging as the state-of-the-art architecture for image recognition. While recent studies suggest that ViTs are more robust than their convolutional counterparts, our experiments find that ViTs trained on ImageNet are overly reliant on local textures and fail to make adequate use of shape information. ViTs thus have difficulties generalizing to out-of-distribution, real-world data. To address this deficiency, we present a simple and effective architecture modification to ViT's input layer by adding discrete tokens produced by a vector-quantized encoder. Different from the standard continuous pixel tokens, discrete tokens are invariant under small perturbations and contain less information individually, which promote ViTs to learn global information that is invariant. Experimental results demonstrate that adding discrete representation on four architecture variants strengthens ViT robustness by up to 12% across seven ImageNet robustness benchmarks while maintaining the performance on ImageNet.},
keywords = {computer vision, google, machine learning, vision transformer},
pubstate = {published},
tppubtype = {inproceedings}
}
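The architectural change described above, adding discrete tokens from a vector-quantized encoder alongside the usual continuous patch tokens, can be sketched as follows. The projection, codebook size, and fusion by concatenation are assumptions for illustration rather than the paper's exact input layer.

import torch
import torch.nn as nn

class DiscretePatchTokens(nn.Module):
    def __init__(self, patch_dim=768, codebook_size=1024, code_dim=64):
        super().__init__()
        self.codebook = nn.Embedding(codebook_size, code_dim)
        self.to_code_space = nn.Linear(patch_dim, code_dim)
        self.fuse = nn.Linear(patch_dim + code_dim, patch_dim)

    def forward(self, patch_tokens):                     # (batch, n_patches, patch_dim)
        z = self.to_code_space(patch_tokens)             # project into codebook space
        book = self.codebook.weight.unsqueeze(0).expand(z.size(0), -1, -1)
        codes = torch.cdist(z, book).argmin(dim=-1)      # nearest discrete code per patch
        discrete = self.codebook(codes)                  # perturbation-robust token stream
        return self.fuse(torch.cat([patch_tokens, discrete], dim=-1))

# tokens = DiscretePatchTokens()(torch.randn(2, 196, 768))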
Steven Hickson, Karthik Raveendran, Irfan Essa
Sharing Decoders: Network Fission for Multi-Task Pixel Prediction Proceedings Article
In: IEEE/CVF Winter Conference on Applications of Computer Vision, pp. 3771–3780, 2022.
Abstract | Links | BibTeX | Tags: computer vision, google, machine learning
@inproceedings{2022-Hickson-SDNFMPP,
title = {Sharing Decoders: Network Fission for Multi-Task Pixel Prediction},
author = {Steven Hickson and Karthik Raveendran and Irfan Essa},
url = {https://openaccess.thecvf.com/content/WACV2022/papers/Hickson_Sharing_Decoders_Network_Fission_for_Multi-Task_Pixel_Prediction_WACV_2022_paper.pdf
https://openaccess.thecvf.com/content/WACV2022/supplemental/Hickson_Sharing_Decoders_Network_WACV_2022_supplemental.pdf
https://youtu.be/qqYODA4C6AU},
doi = {10.1109/WACV51458.2022.00371},
year = {2022},
date = {2022-01-01},
urldate = {2022-01-01},
booktitle = {IEEE/CVF Winter Conference on Applications of Computer Vision},
pages = {3771--3780},
abstract = {We examine the benefits of splitting encoder-decoders for multitask learning and showcase results on three tasks (semantics, surface normals, and depth) while adding very few FLOPS per task. Current hard parameter sharing methods for multi-task pixel-wise labeling use one shared encoder with separate decoders for each task. We generalize this notion and term the splitting of encoder-decoder architectures at different points as fission. Our ablation studies on fission show that sharing most of the decoder layers in multi-task encoder-decoder networks results in improvement while adding far fewer parameters per task. Our proposed method trains faster, uses less memory, results in better accuracy, and uses significantly fewer floating point operations (FLOPS) than conventional multi-task methods, with additional tasks only requiring 0.017% more FLOPS than the single-task network.},
keywords = {computer vision, google, machine learning},
pubstate = {published},
tppubtype = {inproceedings}
}
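The "fission" idea, one encoder and a decoder trunk shared by all tasks with only tiny per-task heads at the end, can be written down compactly. The layer sizes, the late split point, and the three output channel counts below are placeholders; the paper ablates where the split should actually happen.

import torch
import torch.nn as nn

class SharedDecoderNet(nn.Module):
    def __init__(self, task_channels=None):
        super().__init__()
        task_channels = task_channels or {"semantics": 21, "normals": 3, "depth": 1}
        self.encoder = nn.Sequential(
            nn.Conv2d(3, 64, 3, stride=2, padding=1), nn.ReLU(),
            nn.Conv2d(64, 128, 3, stride=2, padding=1), nn.ReLU())
        self.shared_decoder = nn.Sequential(
            nn.ConvTranspose2d(128, 64, 4, stride=2, padding=1), nn.ReLU(),
            nn.ConvTranspose2d(64, 64, 4, stride=2, padding=1), nn.ReLU())
        # Fission only at the very end: each extra task costs a single 1x1 convolution.
        self.heads = nn.ModuleDict(
            {task: nn.Conv2d(64, ch, 1) for task, ch in task_channels.items()})

    def forward(self, x):
        features = self.shared_decoder(self.encoder(x))
        return {task: head(features) for task, head in self.heads.items()}

# outputs = SharedDecoderNet()(torch.randn(1, 3, 128, 128))  # dict with 3 pixel maps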
Niranjan Kumar, Irfan Essa, Sehoon Ha
Cascaded Compositional Residual Learning for Complex Interactive Behaviors Proceedings Article
In: Sim-to-Real Robot Learning: Locomotion and Beyond Workshop at the Conference on Robot Learning (CoRL), arXiv, 2022.
Abstract | Links | BibTeX | Tags: reinforcement learning, robotics
@inproceedings{2022-Kumar-CCRLCIB,
title = {Cascaded Compositional Residual Learning for Complex Interactive Behaviors},
author = {Niranjan Kumar and Irfan Essa and Sehoon Ha},
url = {https://arxiv.org/abs/2212.08954
https://www.kniranjankumar.com/ccrl/static/pdf/paper.pdf
https://youtu.be/fAklIxiK7Qg
},
doi = {10.48550/ARXIV.2212.08954},
year = {2022},
date = {2022-01-01},
urldate = {2022-01-01},
booktitle = {Sim-to-Real Robot Learning: Locomotion and Beyond Workshop at the Conference on Robot Learning (CoRL)},
publisher = {arXiv},
abstract = {Real-world autonomous missions often require rich interaction with nearby objects, such as doors or switches, along with effective navigation. However, such complex behaviors are difficult to learn because they involve both high-level planning and low-level motor control. We present a novel framework, Cascaded Compositional Residual Learning (CCRL), which learns composite skills by recursively leveraging a library of previously learned control policies. Our framework learns multiplicative policy composition, task-specific residual actions, and synthetic goal information simultaneously while freezing the prerequisite policies. We further explicitly control the style of the motion by regularizing residual actions. We show that our framework learns joint-level control policies for a diverse set of motor skills ranging from basic locomotion to complex interactive navigation, including navigating around obstacles, pushing objects, crawling under a table, pushing a door open with its leg, and holding it open while walking through it. The proposed CCRL framework leads to policies with consistent styles and lower joint torques, which we successfully transfer to a real Unitree A1 robot without any additional fine-tuning.},
keywords = {reinforcement learning, robotics},
pubstate = {published},
tppubtype = {inproceedings}
}
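A stripped-down version of "compose frozen base policies, then add a task-specific residual" might look like the following. Note that the paper composes policies multiplicatively; for brevity this sketch blends base actions with a softmax gate instead, so treat it as the general shape of the idea rather than the method itself.

import torch
import torch.nn as nn

class ComposedPolicy(nn.Module):
    def __init__(self, base_policies, obs_dim, act_dim):
        super().__init__()
        self.base_policies = base_policies                # previously learned, kept frozen
        for p in self.base_policies:
            p.requires_grad_(False)
        self.gate = nn.Linear(obs_dim, len(base_policies))
        self.residual = nn.Sequential(
            nn.Linear(obs_dim, 64), nn.Tanh(), nn.Linear(64, act_dim))

    def forward(self, obs):
        with torch.no_grad():
            base_actions = torch.stack([p(obs) for p in self.base_policies], dim=1)
        weights = self.gate(obs).softmax(dim=-1).unsqueeze(-1)   # (batch, K, 1)
        blended = (weights * base_actions).sum(dim=1)            # combine base skills
        return blended + self.residual(obs)                      # task-specific correction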
Apoorva Beedu, Zhile Ren, Varun Agrawal, Irfan Essa
VideoPose: Estimating 6D object pose from videos Technical Report
2021.
Abstract | Links | BibTeX | Tags: arXiv, computer vision, object detection, pose estimation
@techreport{2021-Beedu-VEOPFV,
title = {VideoPose: Estimating 6D object pose from videos},
author = {Apoorva Beedu and Zhile Ren and Varun Agrawal and Irfan Essa},
url = {https://arxiv.org/abs/2111.10677},
doi = {10.48550/arXiv.2111.10677},
year = {2021},
date = {2021-11-01},
urldate = {2021-11-01},
journal = {arXiv preprint arXiv:2111.10677},
abstract = {We introduce a simple yet effective algorithm that uses convolutional neural networks to directly estimate object poses from videos. Our approach leverages the temporal information from a video sequence, and is computationally efficient and robust to support robotic and AR domains. Our proposed network takes a pre-trained 2D object detector as input, and aggregates visual features through a recurrent neural network to make predictions at each frame. Experimental evaluation on the YCB-Video dataset shows that our approach is on par with the state-of-the-art algorithms. Further, with a speed of 30 fps, it is also more efficient than the state-of-the-art, and therefore applicable to a variety of applications that require real-time object pose estimation.},
keywords = {arXiv, computer vision, object detection, pose estimation},
pubstate = {published},
tppubtype = {techreport}
}
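The described structure, per-frame features from a pretrained 2D detector aggregated by a recurrent network into per-frame pose estimates, reduces to a few lines. The feature dimension and the translation-plus-quaternion pose parameterization below are assumptions, not details taken from the report.

import torch
import torch.nn as nn
import torch.nn.functional as F

class VideoPoseSketch(nn.Module):
    def __init__(self, feat_dim=512, hidden=256):
        super().__init__()
        self.rnn = nn.LSTM(feat_dim, hidden, batch_first=True)
        self.trans_head = nn.Linear(hidden, 3)
        self.rot_head = nn.Linear(hidden, 4)

    def forward(self, frame_features):                    # (batch, time, feat_dim)
        h, _ = self.rnn(frame_features)                   # temporal aggregation
        translation = self.trans_head(h)                  # (batch, time, 3)
        rotation = F.normalize(self.rot_head(h), dim=-1)  # (batch, time, 4) unit quaternions
        return translation, rotation

# t, r = VideoPoseSketch()(torch.randn(2, 30, 512))       # detector features for 30-frame clips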
Tianhao Zhang, Hung-Yu Tseng, Lu Jiang, Weilong Yang, Honglak Lee, Irfan Essa
Text as Neural Operator: Image Manipulation by Text Instruction Proceedings Article
In: ACM International Conference on Multimedia (ACM-MM), ACM Press, 2021.
Abstract | Links | BibTeX | Tags: computer vision, generative media, google, multimedia
@inproceedings{2021-Zhang-TNOIMTI,
title = {Text as Neural Operator: Image Manipulation by Text Instruction},
author = {Tianhao Zhang and Hung-Yu Tseng and Lu Jiang and Weilong Yang and Honglak Lee and Irfan Essa},
url = {https://dl.acm.org/doi/10.1145/3474085.3475343
https://arxiv.org/abs/2008.04556},
doi = {10.1145/3474085.3475343},
year = {2021},
date = {2021-10-01},
urldate = {2021-10-01},
booktitle = {ACM International Conference on Multimedia (ACM-MM)},
publisher = {ACM Press},
abstract = {In recent years, text-guided image manipulation has gained increasing attention in the multimedia and computer vision community. The input to conditional image generation has evolved from image-only to multimodality. In this paper, we study a setting that allows users to edit an image with multiple objects using complex text instructions to add, remove, or change the objects. The inputs of the task are multimodal, including (1) a reference image and (2) an instruction in natural language that describes desired modifications to the image. We propose a GAN-based method to tackle this problem. The key idea is to treat text as neural operators to locally modify the image feature. We show that the proposed model performs favorably against recent strong baselines on three public datasets. Specifically, it generates images of greater fidelity and semantic relevance, and when used as an image query, leads to better retrieval performance.},
keywords = {computer vision, generative media, google, multimedia},
pubstate = {published},
tppubtype = {inproceedings}
}
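One simplified reading of "text as an operator that locally modifies image features" is a FiLM-style modulation gated by a predicted spatial mask, sketched below. This is an illustration of the idea only; the paper's GAN architecture and its actual operator differ.

import torch
import torch.nn as nn

class TextOperator(nn.Module):
    def __init__(self, text_dim=256, feat_channels=128):
        super().__init__()
        self.scale_shift = nn.Linear(text_dim, 2 * feat_channels)
        self.where = nn.Conv2d(feat_channels + text_dim, 1, kernel_size=1)

    def forward(self, image_feat, text_emb):              # (B, C, H, W), (B, text_dim)
        b, c, h, w = image_feat.shape
        scale, shift = self.scale_shift(text_emb).chunk(2, dim=-1)
        edited = image_feat * (1 + scale.view(b, c, 1, 1)) + shift.view(b, c, 1, 1)
        text_map = text_emb.view(b, -1, 1, 1).expand(b, text_emb.size(1), h, w)
        gate = torch.sigmoid(self.where(torch.cat([image_feat, text_map], dim=1)))
        return gate * edited + (1 - gate) * image_feat    # edit only where the gate fires

# out = TextOperator()(torch.randn(2, 128, 16, 16), torch.randn(2, 256))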
Peggy Chi, Nathan Frey, Katrina Panovich, Irfan Essa
Automatic Instructional Video Creation from a Markdown-Formatted Tutorial Proceedings Article
In: ACM Symposium on User Interface Software and Technology (UIST), ACM Press, 2021.
Abstract | Links | BibTeX | Tags: google, human-computer interaction, UIST, video editing
@inproceedings{2021-Chi-AIVCFMT,
title = {Automatic Instructional Video Creation from a Markdown-Formatted Tutorial},
author = {Peggy Chi and Nathan Frey and Katrina Panovich and Irfan Essa},
url = {https://doi.org/10.1145/3472749.3474778
https://research.google/pubs/pub50745/
https://youtu.be/WmrZ7PUjyuM},
doi = {10.1145/3472749.3474778},
year = {2021},
date = {2021-10-01},
urldate = {2021-10-01},
booktitle = {ACM Symposium on User Interface Software and Technology (UIST)},
publisher = {ACM Press},
abstract = {We introduce HowToCut, an automatic approach that converts a Markdown-formatted tutorial into an interactive video that presents the visual instructions with a synthesized voiceover for narration. HowToCut extracts instructional content from a multimedia document that describes a step-by-step procedure. Our method selects and converts text instructions to a voiceover. It makes automatic editing decisions to align the narration with edited visual assets, including step images, videos, and text overlays. We derive our video editing strategies from an analysis of 125 web tutorials and apply Computer Vision techniques to the assets. To enable viewers to interactively navigate the tutorial, HowToCut's conversational UI presents instructions in multiple formats upon user commands. We evaluated our automatically-generated video tutorials through user studies (N=20) and validated the video quality via an online survey (N=93). The evaluation shows that our method was able to effectively create informative and useful instructional videos from a web tutorial document for both reviewing and following.},
keywords = {google, human-computer interaction, UIST, video editing},
pubstate = {published},
tppubtype = {inproceedings}
}
Karan Samel, Zelin Zhao, Binghong Chen, Shuang Li, Dharmashankar Subramanian, Irfan Essa, Le Song
Neural Temporal Logic Programming Technical Report
2021.
Abstract | Links | BibTeX | Tags: activity recognition, arXiv, machine learning, openreview
@techreport{2021-Samel-NTLP,
title = {Neural Temporal Logic Programming},
author = {Karan Samel and Zelin Zhao and Binghong Chen and Shuang Li and Dharmashankar Subramanian and Irfan Essa and Le Song},
url = {https://openreview.net/forum?id=i7h4M45tU8},
year = {2021},
date = {2021-09-01},
urldate = {2021-09-01},
abstract = {Events across a timeline are a common data representation, seen in different temporal modalities. Individual atomic events can occur in a certain temporal ordering to compose higher-level composite events. Examples of a composite event are a patient's medical symptom or a baseball player hitting a home run, caused by distinct temporal orderings of patient vitals and player movements, respectively. Such salient composite events are provided as labels in temporal datasets and most works optimize models to predict these composite event labels directly. We focus on uncovering the underlying atomic events and their relations that lead to the composite events within a noisy temporal data setting. We propose Neural Temporal Logic Programming (Neural TLP) which first learns implicit temporal relations between atomic events and then lifts logic rules for composite events, given only the composite events labels for supervision. This is done through efficiently searching through the combinatorial space of all temporal logic rules in an end-to-end differentiable manner. We evaluate our method on video and on healthcare data where it outperforms the baseline methods for rule discovery.},
howpublished = {https://openreview.net/forum?id=i7h4M45tU8},
keywords = {activity recognition, arXiv, machine learning, openreview},
pubstate = {published},
tppubtype = {techreport}
}
Nathan Frey, Peggy Chi, Weilong Yang, Irfan Essa
Automatic Style Transfer for Non-Linear Video Editing Proceedings Article
In: Proceedings of CVPR Workshop on AI for Content Creation (AICC), 2021.
Links | BibTeX | Tags: computational video, CVPR, google, video editing
@inproceedings{2021-Frey-ASTNVE,
title = {Automatic Style Transfer for Non-Linear Video Editing},
author = {Nathan Frey and Peggy Chi and Weilong Yang and Irfan Essa},
url = {https://arxiv.org/abs/2105.06988
https://research.google/pubs/pub50449/},
doi = {10.48550/arXiv.2105.06988},
year = {2021},
date = {2021-06-01},
urldate = {2021-06-01},
booktitle = {Proceedings of CVPR Workshop on AI for Content Creation (AICC)},
keywords = {computational video, CVPR, google, video editing},
pubstate = {published},
tppubtype = {inproceedings}
}
AJ Piergiovanni, Anelia Angelova, Michael S. Ryoo, Irfan Essa
Unsupervised Discovery of Actions in Instructional Videos Proceedings Article
In: British Machine Vision Conference (BMVC), 2021.
Abstract | Links | BibTeX | Tags: activity recognition, computational video, computer vision, google
@inproceedings{2021-Piergiovanni-UDAIV,
title = {Unsupervised Discovery of Actions in Instructional Videos},
author = {AJ Piergiovanni and Anelia Angelova and Michael S. Ryoo and Irfan Essa},
url = {https://arxiv.org/abs/2106.14733
https://www.bmvc2021-virtualconference.com/assets/papers/0773.pdf},
doi = {10.48550/arXiv.2106.14733},
year = {2021},
date = {2021-06-01},
urldate = {2021-06-01},
booktitle = {British Machine Vision Conference (BMVC)},
number = {arXiv:2106.14733},
abstract = {In this paper we address the problem of automatically discovering atomic actions in an unsupervised manner from instructional videos. Instructional videos contain complex activities and are a rich source of information for intelligent agents, such as autonomous robots or virtual assistants, which can, for example, automatically `read' the steps from an instructional video and execute them. However, videos are rarely annotated with atomic activities, their boundaries or duration. We present an unsupervised approach to learn atomic actions of structured human tasks from a variety of instructional videos. We propose a sequential stochastic autoregressive model for temporal segmentation of videos, which learns to represent and discover the sequential relationship between different atomic actions of the task, and which provides automatic and unsupervised self-labeling for videos. Our approach outperforms the state-of-the-art unsupervised methods with large margins. We will open source the code.
},
keywords = {activity recognition, computational video, computer vision, google},
pubstate = {published},
tppubtype = {inproceedings}
}
Harish Haresamudram, Irfan Essa, Thomas Ploetz
Contrastive Predictive Coding for Human Activity Recognition Journal Article
In: Proceedings of the ACM on Interactive, Mobile, Wearable and Ubiquitous Technologies, vol. 5, no. 2, pp. 1–26, 2021.
Abstract | Links | BibTeX | Tags: activity recognition, IMWUT, machine learning, ubiquitous computing
@article{2021-Haresamudram-CPCHAR,
title = {Contrastive Predictive Coding for Human Activity Recognition},
author = {Harish Haresamudram and Irfan Essa and Thomas Ploetz},
url = {https://doi.org/10.1145/3463506
https://arxiv.org/abs/2012.05333},
doi = {10.1145/3463506},
year = {2021},
date = {2021-06-01},
urldate = {2021-06-01},
booktitle = {Proceedings of the ACM on Interactive, Mobile, Wearable and Ubiquitous Technologies},
journal = {Proceedings of the ACM on Interactive, Mobile, Wearable and Ubiquitous Technologies},
volume = {5},
number = {2},
pages = {1--26},
abstract = {Feature extraction is crucial for human activity recognition (HAR) using body-worn movement sensors. Recently, learned representations have been used successfully, offering promising alternatives to manually engineered features. Our work focuses on effective use of small amounts of labeled data and the opportunistic exploitation of unlabeled data that are straightforward to collect in mobile and ubiquitous computing scenarios. We hypothesize and demonstrate that explicitly considering the temporality of sensor data at representation level plays an important role for effective HAR in challenging scenarios. We introduce the Contrastive Predictive Coding (CPC) framework to human activity recognition, which captures the long-term temporal structure of sensor data streams. Through a range of experimental evaluations on real-life recognition tasks, we demonstrate its effectiveness for improved HAR. CPC-based pre-training is self-supervised, and the resulting learned representations can be integrated into standard activity chains. It leads to significantly improved recognition performance when only small amounts of labeled training data are available, thereby demonstrating the practical value of our approach.},
keywords = {activity recognition, IMWUT, machine learning, ubiquitous computing},
pubstate = {published},
tppubtype = {article}
}
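The CPC pre-training objective at the heart of this paper can be condensed to: encode each timestep, summarize the past with a GRU, and score the true future latent against in-batch negatives (InfoNCE). The sketch below uses a single-step prediction horizon and toy dimensions; the paper's encoder and prediction setup are more involved.

import torch
import torch.nn as nn
import torch.nn.functional as F

class CPCSketch(nn.Module):
    def __init__(self, channels=3, z_dim=64, c_dim=128):
        super().__init__()
        self.encoder = nn.Sequential(nn.Linear(channels, z_dim), nn.ReLU())
        self.gru = nn.GRU(z_dim, c_dim, batch_first=True)
        self.predict = nn.Linear(c_dim, z_dim)            # predicts the next latent step

    def info_nce(self, x):                                # x: (batch, time, channels)
        z = self.encoder(x)                               # per-timestep latents
        c, _ = self.gru(z[:, :-1])                        # context from all but the last step
        pred = self.predict(c[:, -1])                     # predicted latent for the final step
        target = z[:, -1]                                 # true final-step latent
        logits = pred @ target.t()                        # each row: true future vs. others'
        labels = torch.arange(x.size(0))
        return F.cross_entropy(logits, labels)

# loss = CPCSketch().info_nce(torch.randn(16, 50, 3))     # 16 windows of 50 timesteps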
Anh Truong, Peggy Chi, David Salesin, Irfan Essa, Maneesh Agrawala
Automatic Generation of Two-Level Hierarchical Tutorials from Instructional Makeup Videos Proceedings Article
In: ACM CHI Conference on Human factors in Computing Systems, 2021.
Abstract | Links | BibTeX | Tags: CHI, computational video, google, human-computer interaction, video summarization
@inproceedings{2021-Truong-AGTHTFIMV,
title = {Automatic Generation of Two-Level Hierarchical Tutorials from Instructional Makeup Videos},
author = {Anh Truong and Peggy Chi and David Salesin and Irfan Essa and Maneesh Agrawala},
url = {https://dl.acm.org/doi/10.1145/3411764.3445721
https://research.google/pubs/pub50007/
http://anhtruong.org/makeup_breakdown/},
doi = {10.1145/3411764.3445721},
year = {2021},
date = {2021-05-01},
urldate = {2021-05-01},
booktitle = {ACM CHI Conference on Human factors in Computing Systems},
abstract = {We present a multi-modal approach for automatically generating hierarchical tutorials from instructional makeup videos. Our approach is inspired by prior research in cognitive psychology, which suggests that people mentally segment procedural tasks into event hierarchies, where coarse-grained events focus on objects while fine-grained events focus on actions. In the instructional makeup domain, we find that objects correspond to facial parts while fine-grained steps correspond to actions on those facial parts. Given an input instructional makeup video, we apply a set of heuristics that combine computer vision techniques with transcript text analysis to automatically identify the fine-level action steps and group these steps by facial part to form the coarse-level events. We provide a voice-enabled, mixed-media UI to visualize the resulting hierarchy and allow users to efficiently navigate the tutorial (e.g., skip ahead, return to previous steps) at their own pace. Users can navigate the hierarchy at both the facial-part and action-step levels using click-based interactions and voice commands. We demonstrate the effectiveness of segmentation algorithms and the resulting mixed-media UI on a variety of input makeup videos. A user study shows that users prefer following instructional makeup videos in our mixed-media format to the standard video UI and that they find our format much easier to navigate.},
keywords = {CHI, computational video, google, human-computer interaction, video summarization},
pubstate = {published},
tppubtype = {inproceedings}
}
Dan Scarafoni, Irfan Essa, Thomas Ploetz
PLAN-B: Predicting Likely Alternative Next Best Sequences for Action Prediction Technical Report
no. arXiv:2103.15987, 2021.
Abstract | Links | BibTeX | Tags: activity recognition, arXiv, computer vision
@techreport{2021-Scarafoni-PPLANBSAP,
title = {PLAN-B: Predicting Likely Alternative Next Best Sequences for Action Prediction},
author = {Dan Scarafoni and Irfan Essa and Thomas Ploetz},
url = {https://arxiv.org/abs/2103.15987},
doi = {10.48550/arXiv.2103.15987},
year = {2021},
date = {2021-03-01},
urldate = {2021-03-01},
journal = {arXiv},
number = {arXiv:2103.15987},
abstract = {Action prediction focuses on anticipating actions before they happen. Recent works leverage probabilistic approaches to describe future uncertainties and sample future actions. However, these methods cannot easily find all alternative predictions, which are essential given the inherent unpredictability of the future, and current evaluation protocols do not measure a system's ability to find such alternatives. We re-examine action prediction in terms of its ability to predict not only the top predictions, but also top alternatives with the accuracy@k metric. In addition, we propose Choice F1: a metric inspired by F1 score which evaluates a prediction system's ability to find all plausible futures while keeping only the most probable ones. To evaluate this problem, we present a novel method, Predicting the Likely Alternative Next Best, or PLAN-B, for action prediction which automatically finds the set of most likely alternative futures. PLAN-B consists of two novel components: (i) a Choice Table which ensures that all possible futures are found, and (ii) a "Collaborative" RNN system which combines both action sequence and feature information. We demonstrate that our system outperforms state-of-the-art results on benchmark datasets.
},
keywords = {activity recognition, arXiv, computer vision},
pubstate = {published},
tppubtype = {techreport}
}
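The accuracy@k evaluation mentioned in the abstract is easy to make concrete: a sample counts as correct if the ground-truth next action appears among the k highest-scoring candidates. The few lines below implement that metric; Choice F1 depends on details in the paper and is not reproduced here.

import numpy as np

def accuracy_at_k(scores, labels, k=5):
    """scores: (n_samples, n_actions) array of per-action scores.
    labels: (n_samples,) array of ground-truth action indices."""
    topk = np.argsort(scores, axis=1)[:, -k:]             # indices of the k best actions
    hits = [labels[i] in topk[i] for i in range(len(labels))]
    return float(np.mean(hits))

# accuracy_at_k(np.random.rand(100, 20), np.random.randint(0, 20, size=100), k=3)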
Vincent Cartillier, Zhile Ren, Neha Jain, Stefan Lee, Irfan Essa, Dhruv Batra
Semantic MapNet: Building Allocentric Semantic Maps and Representations from Egocentric Views Proceedings Article
In: Proceedings of American Association of Artificial Intelligence Conference (AAAI), AAAI, 2021.
Abstract | Links | BibTeX | Tags: AAAI, AI, embodied agents, first-person vision
@inproceedings{2021-Cartillier-SMBASRFEV,
title = {Semantic MapNet: Building Allocentric Semantic Maps and Representations from Egocentric Views},
author = {Vincent Cartillier and Zhile Ren and Neha Jain and Stefan Lee and Irfan Essa and Dhruv Batra},
url = {https://arxiv.org/abs/2010.01191
https://vincentcartillier.github.io/smnet.html
https://ojs.aaai.org/index.php/AAAI/article/view/16180/15987},
doi = {10.48550/arXiv.2010.01191},
year = {2021},
date = {2021-02-01},
urldate = {2021-02-01},
booktitle = {Proceedings of American Association of Artificial Intelligence Conference (AAAI)},
publisher = {AAAI},
abstract = {We study the task of semantic mapping -- specifically, an embodied agent (a robot or an egocentric AI assistant) is given a tour of a new environment and asked to build an allocentric top-down semantic map (`what is where?') from egocentric observations of an RGB-D camera with known pose (via localization sensors). Importantly, our goal is to build neural episodic memories and spatio-semantic representations of 3D spaces that enable the agent to easily learn subsequent tasks in the same space -- navigating to objects seen during the tour (`Find chair') or answering questions about the space (`How many chairs did you see in the house?').
Towards this goal, we present Semantic MapNet (SMNet), which consists of: (1) an Egocentric
Visual Encoder that encodes each egocentric RGB-D frame, (2) a Feature Projector that projects egocentric features to appropriate locations on a floor-plan, (3) a Spatial Memory Tensor of size floor-plan length × width × feature-dims that learns to accumulate projected egocentric features, and (4) a Map Decoder that uses the memory tensor to produce semantic top-down maps. SMNet combines the strengths of (known) projective camera geometry and neural representation learning. On the task of semantic mapping in the Matterport3D dataset, SMNet significantly outperforms competitive baselines by 4.01-16.81% (absolute) on mean-IoU and 3.81-19.69% (absolute) on Boundary-F1 metrics. Moreover, we show how to use the spatio-semantic allocentric representations built by SMNet for the task of ObjectNav and Embodied Question Answering.},
keywords = {AAAI, AI, embodied agents, first-person vision},
pubstate = {published},
tppubtype = {inproceedings}
}
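The projection step in the abstract, carrying egocentric features onto an allocentric floor-plan grid, can be illustrated as below, assuming the per-pixel world coordinates have already been computed from depth and the known camera pose. SMNet learns the accumulation with a spatial memory tensor; this sketch simply max-pools features that land in the same cell.

import torch

def project_to_floorplan(features, world_xy, grid_size=100, cell=0.1):
    """features: (N, D) egocentric per-pixel features; world_xy: (N, 2) metric coordinates."""
    floor_map = torch.zeros(grid_size, grid_size, features.size(1))
    cells = (world_xy / cell).long().clamp(0, grid_size - 1)      # grid indices per pixel
    for (i, j), feat in zip(cells.tolist(), features):
        floor_map[i, j] = torch.maximum(floor_map[i, j], feat)    # keep the strongest evidence
    return floor_map                                              # (grid_size, grid_size, D)

# top_down = project_to_floorplan(torch.rand(5000, 64), torch.rand(5000, 2) * 10)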
Niranjan Kumar, Irfan Essa, Sehoon Ha, C. Karen Liu
Estimating Mass Distribution of Articulated Objects through Non-prehensile Manipulation Proceedings Article
In: Neural Information Processing Systems (NeurIPS) Workshop on Object Representations for Learning and Reasoning, NeurIPS 2020.
Abstract | Links | BibTeX | Tags: reinforcement learning, robotics
@inproceedings{2020-Kumar-EMDAOTNM,
title = {Estimating Mass Distribution of Articulated Objects through Non-prehensile Manipulation},
author = {Niranjan Kumar and Irfan Essa and Sehoon Ha and C. Karen Liu},
url = {https://orlrworkshop.github.io/program/orlr_25.html
http://arxiv.org/abs/1907.03964
https://www.kniranjankumar.com/projects/1_mass_prediction
https://www.youtube.com/watch?v=o3zBdVWvWZw
https://kniranjankumar.github.io/assets/pdf/Estimating_Mass_Distribution_of_Articulated_Objects_using_Non_prehensile_Manipulation.pdf},
year = {2020},
date = {2020-12-01},
urldate = {2020-12-01},
booktitle = {Neural Information Processing Systems (NeurIPS) Workshop on Object Representations for Learning and Reasoning},
organization = {NeurIPS},
abstract = {We explore the problem of estimating the mass distribution of an articulated object by an interactive robotic agent. Our method predicts the mass distribution of an object by using limited sensing and actuating capabilities of a robotic agent that is interacting with the object. We are inspired by the role of exploratory play in human infants. We take the combined approach of supervised and reinforcement learning to train an agent that learns to strategically interact with the object to estimate the object's mass distribution. Our method consists of two neural networks: (i) the policy network which decides how to interact with the object, and (ii) the predictor network that estimates the mass distribution given a history of observations and interactions. Using our method, we train a robotic arm to estimate the mass distribution of an object with moving parts (e.g. an articulated rigid body system) by pushing it on a surface with unknown friction properties. We also demonstrate how our training from simulations can be transferred to real hardware using a small amount of real-world data for fine-tuning. We use a UR10 robot to interact with 3D printed articulated chains with varying mass distributions and show that our method significantly outperforms the baseline system that uses random pushes to interact with the object.},
howpublished = {arXiv preprint arXiv:1907.03964},
keywords = {reinforcement learning, robotics},
pubstate = {published},
tppubtype = {inproceedings}
}
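The two-network layout described above, a policy that proposes the next push and a predictor that reads the interaction history, can be sketched as follows; every dimension (observation, action, number of links) is a placeholder rather than a value from the paper.

import torch
import torch.nn as nn

class PushPolicy(nn.Module):
    """Decides how to interact next, given the current observation."""
    def __init__(self, obs_dim=32, act_dim=4):
        super().__init__()
        self.net = nn.Sequential(nn.Linear(obs_dim, 64), nn.ReLU(),
                                 nn.Linear(64, act_dim), nn.Tanh())

    def forward(self, obs):                               # (batch, obs_dim)
        return self.net(obs)

class MassPredictor(nn.Module):
    """Regresses per-link masses from the history of observations and pushes."""
    def __init__(self, obs_dim=32, act_dim=4, num_links=3, hidden=128):
        super().__init__()
        self.gru = nn.GRU(obs_dim + act_dim, hidden, batch_first=True)
        self.head = nn.Linear(hidden, num_links)

    def forward(self, obs_seq, act_seq):                  # (batch, time, obs/act dims)
        h, _ = self.gru(torch.cat([obs_seq, act_seq], dim=-1))
        return self.head(h[:, -1]).relu()                 # non-negative mass estimates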
Other Publication Sites
A few more sites that aggregate research publications: Academia.edu, Bibsonomy, CiteULike, Mendeley.
Copyright/About
[Please see the Copyright Statement that may apply to the content listed here.]
This list of publications is produced by using the teachPress plugin for WordPress.