A searchable list of some of my publications is below. You can also access my publications from the following sites.
My ORCID is
Publications:
Seung Hyun Lee, Yinxiao Li, Junjie Ke, Innfarn Yoo, Han Zhang, Jiahui Yu, Qifei Wang, Fei Deng, Glenn Entis, Junfeng He, Gang Li, Sangpil Kim, Irfan Essa, Feng Yang
Parrot: Pareto-optimal multi-reward reinforcement learning framework for text-to-image generation Proceedings Article
In: Proceedings of European Conference on Computer Vision (ECCV), 2024.
Abstract | Links | BibTeX | Tags: arXiv, computer vision, ECCV, generative AI, google, reinforcement learning
@inproceedings{2024-Lee-PPMRLFTG,
title = {Parrot: Pareto-optimal multi-reward reinforcement learning framework for text-to-image generation},
author = {Seung Hyun Lee and Yinxiao Li and Junjie Ke and Innfarn Yoo and Han Zhang and Jiahui Yu and Qifei Wang and Fei Deng and Glenn Entis and Junfeng He and Gang Li and Sangpil Kim and Irfan Essa and Feng Yang
},
url = {https://arxiv.org/abs/2401.05675
https://arxiv.org/pdf/2401.05675
https://dl.acm.org/doi/10.1007/978-3-031-72920-1_26},
doi = {10.48550/arXiv.2401.05675},
year = {2024},
date = {2024-07-25},
urldate = {2024-07-25},
booktitle = {Proceedings of European Conference on Computer Vision (ECCV)
},
abstract = {Recent works have demonstrated that using reinforcement learning (RL) with multiple quality rewards can improve the quality of generated images in text-to-image (T2I) generation. However, manually adjusting reward weights poses challenges and may cause over-optimization in certain metrics. To solve this, we propose Parrot, which addresses the issue through multi-objective optimization and introduces an effective multi-reward optimization strategy to approximate Pareto optimal. Utilizing batch-wise Pareto optimal selection, Parrot automatically identifies the optimal trade-off among different rewards. We use the novel multi-reward optimization algorithm to jointly optimize the T2I model and a prompt expansion network, resulting in significant improvement of image quality and also allow to control the trade-off of different rewards using a reward related prompt during inference. Furthermore, we introduce original prompt-centered guidance at inference time, ensuring fidelity to user input after prompt expansion. Extensive experiments and a user study validate the superiority of Parrot over several baselines across various quality criteria, including aesthetics, human preference, text-image alignment, and image sentiment.
},
keywords = {arXiv, computer vision, ECCV, generative AI, google, reinforcement learning},
pubstate = {published},
tppubtype = {inproceedings}
}
Agrim Gupta, Lijun Yu, Kihyuk Sohn, Xiuye Gu, Meera Hahn, Li Fei-Fei, Irfan Essa, Lu Jiang, José Lezama
Photorealistic Video Generation with Diffusion Models Proceedings Article
In: European Conference on Computer Vision (ECCV), 2024.
Abstract | Links | BibTeX | Tags: arXiv, computational video, computer vision, generative AI, google
@inproceedings{2024-Gupta-PVGWDM,
title = {Photorealistic Video Generation with Diffusion Models},
author = {Agrim Gupta and Lijun Yu and Kihyuk Sohn and Xiuye Gu and Meera Hahn and Li Fei-Fei and Irfan Essa and Lu Jiang and José Lezama
},
url = {https://walt-video-diffusion.github.io/
https://arxiv.org/abs/2312.06662
https://arxiv.org/pdf/2312.06662
},
doi = {10.48550/arXiv.2312.06662},
year = {2024},
date = {2024-07-25},
urldate = {2024-07-25},
booktitle = {European Conference on Computer Vision (ECCV)},
abstract = {We present W.A.L.T, a transformer-based approach for photorealistic video generation via diffusion modeling. Our approach has two key design decisions. First, we use a causal encoder to jointly compress images and videos within a unified latent space, enabling training and generation across modalities. Second, for memory and training efficiency, we use a window attention architecture tailored for joint spatial and spatiotemporal generative modeling. Taken together these design decisions enable us to achieve state-of-the-art performance on established video (UCF-101 and Kinetics-600) and image (ImageNet) generation benchmarks without using classifier free guidance. Finally, we also train a cascade of three models for the task of text-to-video generation consisting of a base latent video diffusion model, and two video super-resolution diffusion models to generate videos of 512×896 resolution at 8 frames per second.},
keywords = {arXiv, computational video, computer vision, generative AI, google},
pubstate = {published},
tppubtype = {inproceedings}
}
Dan Kondratyuk, Lijun Yu, Xiuye Gu, José Lezama, Jonathan Huang, Grant Schindler, Rachel Hornung, Vighnesh Birodkar, Jimmy Yan, Ming-Chang Chiu, Krishna Somandepalli, Hassan Akbari, Yair Alon, Yong Cheng, Josh Dillon, Agrim Gupta, Meera Hahn, Anja Hauth, David Hendon, Alonso Martinez, David Minnen, Mikhail Sirotenko, Kihyuk Sohn, Xuan Yang, Hartwig Adam, Ming-Hsuan Yang, Irfan Essa, Huisheng Wang, David A. Ross, Bryan Seybold, Lu Jiang
VideoPoet: A large language model for zero-shot video generation (Best Paper) Proceedings Article
In: Proceedings of International Conference on Machine Learning (ICML), 2024.
Abstract | Links | BibTeX | Tags: arXiv, best paper award, computational video, computer vision, generative AI, google, ICML
@inproceedings{2024-Kondratyuk-VLLMZVG,
title = {VideoPoet: A large language model for zero-shot video generation},
author = {Dan Kondratyuk and Lijun Yu and Xiuye Gu and José Lezama and Jonathan Huang and Grant Schindler and Rachel Hornung and Vighnesh Birodkar and Jimmy Yan and Ming-Chang Chiu and Krishna Somandepalli and Hassan Akbari and Yair Alon and Yong Cheng and Josh Dillon and Agrim Gupta and Meera Hahn and Anja Hauth and David Hendon and Alonso Martinez and David Minnen and Mikhail Sirotenko and Kihyuk Sohn and Xuan Yang and Hartwig Adam and Ming-Hsuan Yang and Irfan Essa and Huisheng Wang and David A. Ross and Bryan Seybold and Lu Jiang
},
url = {https://arxiv.org/pdf/2312.14125
https://arxiv.org/abs/2312.14125
https://sites.research.google/videopoet/},
doi = {10.48550/arXiv.2312.14125},
year = {2024},
date = {2024-07-23},
urldate = {2024-07-23},
booktitle = {Proceedings of International Conference on Machine Learning (ICML)},
abstract = {We present VideoPoet, a language model capable of synthesizing high-quality video, with matching audio, from a large variety of conditioning signals. VideoPoet employs a decoder-only transformer architecture that processes multimodal inputs -- including images, videos, text, and audio. The training protocol follows that of Large Language Models (LLMs), consisting of two stages: pretraining and task-specific adaptation. During pretraining, VideoPoet incorporates a mixture of multimodal generative objectives within an autoregressive Transformer framework. The pretrained LLM serves as a foundation that can be adapted for a range of video generation tasks. We present empirical results demonstrating the model's state-of-the-art capabilities in zero-shot video generation, specifically highlighting VideoPoet's ability to generate high-fidelity motions. Project page: http://sites.research.google/videopoet/
},
keywords = {arXiv, best paper award, computational video, computer vision, generative AI, google, ICML},
pubstate = {published},
tppubtype = {inproceedings}
}
Xingqian Xu, Jiayi Guo, Zhangyang Wang, Gao Huang, Irfan Essa, Humphrey Shi
Prompt-Free Diffusion: Taking "Text" out of Text-to-Image Diffusion Models Proceedings Article
In: Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 8682–8692, 2024.
Abstract | Links | BibTeX | Tags: arXiv, computer vision, CVPR, generative AI
@inproceedings{2024-Xu-PDTTTDM,
title = {Prompt-Free Diffusion: Taking "Text" out of Text-to-Image Diffusion Models},
author = {Xingqian Xu and Jiayi Guo and Zhangyang Wang and Gao Huang and Irfan Essa and Humphrey Shi
},
url = {https://openaccess.thecvf.com/content/CVPR2024/papers/Xu_Prompt-Free_Diffusion_Taking_Text_out_of_Text-to-Image_Diffusion_Models_CVPR_2024_paper.pdf
https://openaccess.thecvf.com/content/CVPR2024/html/Xu_Prompt-Free_Diffusion_Taking_Text_out_of_Text-to-Image_Diffusion_Models_CVPR_2024_paper.html
https://arxiv.org/abs/2305.16223
},
doi = {10.48550/arXiv.2305.16223},
year = {2024},
date = {2024-06-18},
urldate = {2024-06-18},
booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)
},
pages = {8682--8692},
abstract = {Text-to-image (T2I) research has grown explosively in the past year owing to the large-scale pre-trained diffusion models and many emerging personalization and editing approaches. Yet one pain point persists: the text prompt engineering and searching high-quality text prompts for customized results is more art than science. Moreover as commonly argued: "an image is worth a thousand words" - the attempt to describe a desired image with texts often ends up being ambiguous and cannot comprehensively cover delicate visual details hence necessitating more additional controls from the visual domain. In this paper we take a bold step forward: taking "Text" out of a pretrained T2I diffusion model to reduce the burdensome prompt engineering efforts for users. Our proposed framework Prompt-Free Diffusion relies on only visual inputs to generate new images: it takes a reference image as "context" an optional image structural conditioning and an initial noise with absolutely no text prompt. The core architecture behind the scene is Semantic Context Encoder (SeeCoder) substituting the commonly used CLIP-based or LLM-based text encoder. The reusability of SeeCoder also makes it a convenient drop-in component: one can also pre-train a SeeCoder in one T2I model and reuse it for another. Through extensive experiments Prompt-Free Diffusion is experimentally found to (i) outperform prior exemplar-based image synthesis approaches; (ii) perform on par with state-of-the-art T2I models using prompts following the best practice; and (iii) be naturally extensible to other downstream applications such as anime figure generation and virtual try-on with promising quality. Our code and models will be open-sourced.
},
keywords = {arXiv, computer vision, CVPR, generative AI},
pubstate = {published},
tppubtype = {inproceedings}
}
Lijun Yu, José Lezama, Nitesh B. Gundavarapu, Luca Versari, Kihyuk Sohn, David Minnen, Yong Cheng, Vighnesh Birodkar, Agrim Gupta, Xiuye Gu, Alexander G. Hauptmann, Boqing Gong, Ming-Hsuan Yang, Irfan Essa, David A. Ross, Lu Jiang
Language Model Beats Diffusion -- Tokenizer is Key to Visual Generation Proceedings Article
In: Proceedings of International Conference on Learning Representations (ICLR), 2024.
Abstract | Links | BibTeX | Tags: AI, arXiv, computer vision, generative AI, google, ICLR
@inproceedings{2024-Yu-LMBDVG,
title = {Language Model Beats Diffusion -- Tokenizer is Key to Visual Generation},
author = {Lijun Yu and José Lezama and Nitesh B. Gundavarapu and Luca Versari and Kihyuk Sohn and David Minnen and Yong Cheng and Vighnesh Birodkar and Agrim Gupta and Xiuye Gu and Alexander G. Hauptmann and Boqing Gong and Ming-Hsuan Yang and Irfan Essa and David A. Ross and Lu Jiang},
url = {https://arxiv.org/abs/2310.05737
https://arxiv.org/pdf/2310.05737},
doi = {10.48550/arXiv.2310.05737},
year = {2024},
date = {2024-05-14},
urldate = {2024-05-14},
booktitle = {Proceedings of International Conference on Learning Representations (ICLR)
},
abstract = {While Large Language Models (LLMs) are the dominant models for generative tasks in language, they do not perform as well as diffusion models on image and video generation. To effectively use LLMs for visual generation, one crucial component is the visual tokenizer that maps pixel-space inputs to discrete tokens appropriate for LLM learning. In this paper, we introduce MAGVIT-v2, a video tokenizer designed to generate concise and expressive tokens for both videos and images using a common token vocabulary. Equipped with this new tokenizer, we show that LLMs outperform diffusion models on standard image and video generation benchmarks including ImageNet and Kinetics. In addition, we demonstrate that our tokenizer surpasses the previously top-performing video tokenizer on two more tasks: (1) video compression comparable to the next-generation video codec (VVC) according to human evaluations, and (2) learning effective representations for action recognition tasks.
},
keywords = {AI, arXiv, computer vision, generative AI, google, ICLR},
pubstate = {published},
tppubtype = {inproceedings}
}
Harish Haresamudram, Irfan Essa, Thomas Ploetz
Towards Learning Discrete Representations via Self-Supervision for Wearables-Based Human Activity Recognition Journal Article
In: Sensors, vol. 24, no. 4, 2024.
Abstract | Links | BibTeX | Tags: activity recognition, arXiv, wearable computing
@article{2023-Haresamudram-TLDRSWHAR,
title = {Towards Learning Discrete Representations via Self-Supervision for Wearables-Based Human Activity Recognition},
author = {Harish Haresamudram and Irfan Essa and Thomas Ploetz},
url = {https://arxiv.org/abs/2306.01108
https://www.mdpi.com/1424-8220/24/4/1238},
doi = {10.48550/arXiv.2306.01108},
year = {2024},
date = {2024-02-24},
urldate = {2023-06-01},
journal = {Sensors},
volume = {24},
number = {4},
abstract = {Human activity recognition (HAR) in wearable computing is typically based on direct processing of sensor data. Sensor readings are translated into representations, either derived through dedicated preprocessing, or integrated into end-to-end learning. Independent of their origin, for the vast majority of contemporary HAR, those representations are typically continuous in nature. That has not always been the case. In the early days of HAR, discretization approaches have been explored - primarily motivated by the desire to minimize computational requirements, but also with a view on applications beyond mere recognition, such as, activity discovery, fingerprinting, or large-scale search. Those traditional discretization approaches, however, suffer from substantial loss in precision and resolution in the resulting representations with detrimental effects on downstream tasks. Times have changed and in this paper we propose a return to discretized representations. We adopt and apply recent advancements in Vector Quantization (VQ) to wearables applications, which enables us to directly learn a mapping between short spans of sensor data and a codebook of vectors, resulting in recognition performance that is generally on par with their contemporary, continuous counterparts - sometimes surpassing them. Therefore, this work presents a proof-of-concept for demonstrating how effective discrete representations can be derived, enabling applications beyond mere activity classification but also opening up the field to advanced tools for the analysis of symbolic sequences, as they are known, for example, from domains such as natural language processing. Based on an extensive experimental evaluation on a suite of wearables-based benchmark HAR tasks, we demonstrate the potential of our learned discretization scheme and discuss how discretized sensor data analysis can lead to substantial changes in HAR.},
howpublished = {arXiv:2306.01108},
keywords = {activity recognition, arXiv, wearable computing},
pubstate = {published},
tppubtype = {article}
}
Kihyuk Sohn, Nataniel Ruiz, Kimin Lee, Daniel Castro Chin, Irina Blok, Huiwen Chang, Jarred Barber, Lu Jiang, Glenn Entis, Yuanzhen Li, Yuan Hao, Irfan Essa, Michael Rubinstein, Dilip Krishnan
StyleDrop: Text-to-Image Generation in Any Style Proceedings Article
In: Advances in Neural Information Processing Systems (NeurIPS), 2023.
Abstract | Links | BibTeX | Tags: arXiv, computer vision, generative AI, google, NeurIPS
@inproceedings{2023-Sohn-STGS,
title = {StyleDrop: Text-to-Image Generation in Any Style},
author = {Kihyuk Sohn and Nataniel Ruiz and Kimin Lee and Daniel Castro Chin and Irina Blok and Huiwen Chang and Jarred Barber and Lu Jiang and Glenn Entis and Yuanzhen Li and Yuan Hao and Irfan Essa and Michael Rubinstein and Dilip Krishnan},
url = {https://arxiv.org/abs/2306.00983
https://openreview.net/forum?id=KoaFh16uOc
https://proceedings.neurips.cc/paper_files/paper/2023/hash/d33b177b69425e7685b0b1c05bd2a5e4-Abstract-Conference.html},
doi = {10.48550/arXiv.2306.00983},
year = {2023},
date = {2023-12-11},
urldate = {2023-12-11},
booktitle = {Advances in Neural Information Processing Systems (NeurIPS)},
abstract = {Pre-trained large text-to-image models synthesize impressive images with an appropriate use of text prompts. However, ambiguities inherent in natural language and out-of-distribution effects make it hard to synthesize image styles, that leverage a specific design pattern, texture or material. In this paper, we introduce StyleDrop, a method that enables the synthesis of images that faithfully follow a specific style using a text-to-image model. The proposed method is extremely versatile and captures nuances and details of a user-provided style, such as color schemes, shading, design patterns, and local and global effects. It efficiently learns a new style by fine-tuning very few trainable parameters (less than 1% of total model parameters) and improving the quality via iterative training with either human or automated feedback. Better yet, StyleDrop is able to deliver impressive results even when the user supplies only a single image that specifies the desired style. An extensive study shows that, for the task of style tuning text-to-image models, StyleDrop implemented on Muse convincingly outperforms other methods, including DreamBooth and textual inversion on Imagen or Stable Diffusion. More results are available at our project website: this https URL},
howpublished = {arXiv:2306.00983},
keywords = {arXiv, computer vision, generative AI, google, NeurIPS},
pubstate = {published},
tppubtype = {inproceedings}
}
Lijun Yu, Yong Cheng, Zhiruo Wang, Vivek Kumar, Wolfgang Macherey, Yanping Huang, David A. Ross, Irfan Essa, Yonatan Bisk, Ming-Hsuan Yang, Kevin Murphy, Alexander G. Hauptmann, Lu Jiang
SPAE: Semantic Pyramid AutoEncoder for Multimodal Generation with Frozen LLMs Proceedings Article
In: Advances in Neural Information Processing Systems (NeurIPS), 2023.
Abstract | Links | BibTeX | Tags: arXiv, computational video, computer vision, generative AI, NeurIPS
@inproceedings{2023-Yu-SSPAMGWFL,
title = {SPAE: Semantic Pyramid AutoEncoder for Multimodal Generation with Frozen LLMs},
author = {Lijun Yu and Yong Cheng and Zhiruo Wang and Vivek Kumar and Wolfgang Macherey and Yanping Huang and David A. Ross and Irfan Essa and Yonatan Bisk and Ming-Hsuan Yang and Kevin Murphy and Alexander G. Hauptmann and Lu Jiang},
url = {https://arxiv.org/abs/2306.17842
https://openreview.net/forum?id=CXPUg86A1D
https://proceedings.neurips.cc/paper_files/paper/2023/hash/a526cc8f6ffb74bedb6ff313e3fdb450-Abstract-Conference.html},
doi = {10.48550/arXiv.2306.17842},
year = {2023},
date = {2023-12-11},
urldate = {2023-12-11},
booktitle = {Advances in Neural Information Processing Systems (NeurIPS)},
abstract = {In this work, we introduce Semantic Pyramid AutoEncoder (SPAE) for enabling frozen LLMs to perform both understanding and generation tasks involving non-linguistic modalities such as images or videos. SPAE converts between raw pixels and interpretable lexical tokens (or words) extracted from the LLM's vocabulary. The resulting tokens capture both the semantic meaning and the fine-grained details needed for visual reconstruction, effectively translating the visual content into a language comprehensible to the LLM, and empowering it to perform a wide array of multimodal tasks. Our approach is validated through in-context learning experiments with frozen PaLM 2 and GPT 3.5 on a diverse set of image understanding and generation tasks. Our method marks the first successful attempt to enable a frozen LLM to generate image content while surpassing state-of-the-art performance in image understanding tasks, under the same setting, by over 25%.},
howpublished = {Advances in Neural Information Processing Systems (NeurIPS) (arXiv:2306.17842v2)},
keywords = {arXiv, computational video, computer vision, generative AI, NeurIPS},
pubstate = {published},
tppubtype = {inproceedings}
}
Nikolai Warner, Meera Hahn, Jonathan Huang, Irfan Essa, Vighnesh Birodkar
Text and Click inputs for unambiguous open vocabulary instance segmentation Proceedings Article
In: Proceedings of the British Machine Vision Conference (BMVC), 2023.
Abstract | Links | BibTeX | Tags: arXiv, BMVC, computer vision, google, image segmentation
@inproceedings{2023-Warner-TACIFUOVIS,
title = {Text and Click inputs for unambiguous open vocabulary instance segmentation},
author = {Nikolai Warner and Meera Hahn and Jonathan Huang and Irfan Essa and Vighnesh Birodkar},
url = {https://doi.org/10.48550/arXiv.2311.14822
https://arxiv.org/abs/2311.14822
https://arxiv.org/pdf/2311.14822.pdf},
doi = {10.48550/arXiv.2311.14822},
year = {2023},
date = {2023-11-24},
urldate = {2023-11-24},
booktitle = {Proceedings of the British Machine Vision Conference (BMVC)},
abstract = {Segmentation localizes objects in an image on a fine-grained per-pixel scale. Segmentation benefits by humans-in-the-loop to provide additional input of objects to segment using a combination of foreground or background clicks. Tasks include photoediting or novel dataset annotation, where human annotators leverage an existing segmentation model instead of drawing raw pixel level annotations. We propose a new segmentation process, Text + Click segmentation, where a model takes as input an image, a text phrase describing a class to segment, and a single foreground click specifying the instance to segment. Compared to previous approaches, we leverage open-vocabulary image-text models to support a wide-range of text prompts. Conditioning segmentations on text prompts improves the accuracy of segmentations on novel or unseen classes. We demonstrate that the combination of a single user-specified foreground click and a text prompt allows a model to better disambiguate overlapping or co-occurring semantic categories, such as "tie", "suit", and "person". We study these results across common segmentation datasets such as refCOCO, COCO, VOC, and OpenImages. Source code available here.
},
keywords = {arXiv, BMVC, computer vision, google, image segmentation},
pubstate = {published},
tppubtype = {inproceedings}
}
K. Niranjan Kumar, Irfan Essa, Sehoon Ha
Words into Action: Learning Diverse Humanoid Robot Behaviors using Language Guided Iterative Motion Refinement Proceedings Article
In: CoRL Workshop on Language and Robot Learning: Language as Grounding (with CoRL 2023), 2023.
Abstract | Links | BibTeX | Tags: arXiv, CoRL, robotics, vision & language
@inproceedings{2023-Kumar-WIALDHRBULGIM,
title = {Words into Action: Learning Diverse Humanoid Robot Behaviors using Language Guided Iterative Motion Refinement},
author = {K. Niranjan Kumar and Irfan Essa and Sehoon Ha},
url = {https://doi.org/10.48550/arXiv.2310.06226
https://arxiv.org/abs/2310.06226
https://arxiv.org/pdf/2310.06226.pdf
https://www.kniranjankumar.com/words_into_action/
},
doi = {10.48550/arXiv.2310.06226},
year = {2023},
date = {2023-11-01},
urldate = {2023-11-01},
booktitle = {CoRL Workshop on Language and Robot Learning: Language as Grounding (with CoRL 2023)},
abstract = {We present a method to simplify controller design by enabling users to train and fine-tune robot control policies using natural language commands. We first learn a neural network policy that generates behaviors given a natural language command, such as “walk forward”, by combining Large Language Models (LLMs), motion retargeting, and motion imitation. Based on the synthesized motion, we iteratively fine-tune by updating the text prompt and querying LLMs to find the best checkpoint associated with the closest motion in history.},
keywords = {arXiv, CoRL, robotics, vision & language},
pubstate = {published},
tppubtype = {inproceedings}
}
Kihyuk Sohn, Albert Shaw, Yuan Hao, Han Zhang, Luisa Polania, Huiwen Chang, Lu Jiang, Irfan Essa
Learning Disentangled Prompts for Compositional Image Synthesis Technical Report
2023.
Abstract | Links | BibTeX | Tags: arXiv, computer vision, generative AI, google, prompt engineering
@techreport{2023-Sohn-LDPCIS,
title = {Learning Disentangled Prompts for Compositional Image Synthesis},
author = {Kihyuk Sohn and Albert Shaw and Yuan Hao and Han Zhang and Luisa Polania and Huiwen Chang and Lu Jiang and Irfan Essa},
url = {https://arxiv.org/abs/2306.00763},
doi = {10.48550/arXiv.2306.00763},
year = {2023},
date = {2023-06-01},
urldate = {2023-06-01},
abstract = {We study domain-adaptive image synthesis, the problem of teaching pretrained image generative models a new style or concept from as few as one image to synthesize novel images, to better understand the compositional image synthesis. We present a framework that leverages a pre-trained class-conditional generation model and visual prompt tuning. Specifically, we propose a novel source class distilled visual prompt that learns disentangled prompts of semantic (e.g., class) and domain (e.g., style) from a few images. Learned domain prompt is then used to synthesize images of any classes in the style of target domain. We conduct studies on various target domains with the number of images ranging from one to a few to many, and show qualitative results which show the compositional generalization of our method. Moreover, we show that our method can help improve zero-shot domain adaptation classification accuracy.
},
howpublished = {arXiv:2306.00763 },
keywords = {arXiv, computer vision, generative AI, google, prompt engineering},
pubstate = {published},
tppubtype = {techreport}
}
Apoorva Beedu, Zhile Ren, Varun Agrawal, Irfan Essa
VideoPose: Estimating 6D object pose from videos Technical Report
2021.
Abstract | Links | BibTeX | Tags: arXiv, computer vision, object detection, pose estimation
@techreport{2021-Beedu-VEOPFV,
title = {VideoPose: Estimating 6D object pose from videos},
author = {Apoorva Beedu and Zhile Ren and Varun Agrawal and Irfan Essa},
url = {https://arxiv.org/abs/2111.10677},
doi = {10.48550/arXiv.2111.10677},
year = {2021},
date = {2021-11-01},
urldate = {2021-11-01},
journal = {arXiv preprint arXiv:2111.10677},
abstract = {We introduce a simple yet effective algorithm that uses convolutional neural networks to directly estimate object poses from videos. Our approach leverages the temporal information from a video sequence, and is computationally efficient and robust to support robotic and AR domains. Our proposed network takes a pre-trained 2D object detector as input, and aggregates visual features through a recurrent neural network to make predictions at each frame. Experimental evaluation on the YCB-Video dataset show that our approach is on par with the state-of-the-art algorithms. Further, with a speed of 30 fps, it is also more efficient than the state-of-the-art, and therefore applicable to a variety of applications that require real-time object pose estimation.},
keywords = {arXiv, computer vision, object detection, pose estimation},
pubstate = {published},
tppubtype = {techreport}
}
Karan Samel, Zelin Zhao, Binghong Chen, Shuang Li, Dharmashankar Subramanian, Irfan Essa, Le Song
Neural Temporal Logic Programming Technical Report
2021.
Abstract | Links | BibTeX | Tags: activity recognition, arXiv, machine learning, openreview
@techreport{2021-Samel-NTLP,
title = {Neural Temporal Logic Programming},
author = {Karan Samel and Zelin Zhao and Binghong Chen and Shuang Li and Dharmashankar Subramanian and Irfan Essa and Le Song},
url = {https://openreview.net/forum?id=i7h4M45tU8},
year = {2021},
date = {2021-09-01},
urldate = {2021-09-01},
abstract = {Events across a timeline are a common data representation, seen in different temporal modalities. Individual atomic events can occur in a certain temporal ordering to compose higher-level composite events. Examples of a composite event are a patient's medical symptom or a baseball player hitting a home run, caused by distinct temporal orderings of patient vitals and player movements respectively. Such salient composite events are provided as labels in temporal datasets and most works optimize models to predict these composite event labels directly. We focus on uncovering the underlying atomic events and their relations that lead to the composite events within a noisy temporal data setting. We propose Neural Temporal Logic Programming (Neural TLP) which first learns implicit temporal relations between atomic events and then lifts logic rules for composite events, given only the composite event labels for supervision. This is done through efficiently searching through the combinatorial space of all temporal logic rules in an end-to-end differentiable manner. We evaluate our method on video and on healthcare data where it outperforms the baseline methods for rule discovery.},
howpublished = {https://openreview.net/forum?id=i7h4M45tU8},
keywords = {activity recognition, arXiv, machine learning, openreview},
pubstate = {published},
tppubtype = {techreport}
}
Dan Scarafoni, Irfan Essa, Thomas Ploetz
PLAN-B: Predicting Likely Alternative Next Best Sequences for Action Prediction Technical Report
no. arXiv:2103.15987, 2021.
Abstract | Links | BibTeX | Tags: activity recognition, arXiv, computer vision
@techreport{2021-Scarafoni-PPLANBSAP,
title = {PLAN-B: Predicting Likely Alternative Next Best Sequences for Action Prediction},
author = {Dan Scarafoni and Irfan Essa and Thomas Ploetz},
url = {https://arxiv.org/abs/2103.15987},
doi = {10.48550/arXiv.2103.15987},
year = {2021},
date = {2021-03-01},
urldate = {2021-03-01},
journal = {arXiv},
number = {arXiv:2103.15987},
abstract = {Action prediction focuses on anticipating actions before they happen. Recent works leverage probabilistic approaches to describe future uncertainties and sample future actions. However, these methods cannot easily find all alternative predictions, which are essential given the inherent unpredictability of the future, and current evaluation protocols do not measure a system's ability to find such alternatives. We re-examine action prediction in terms of its ability to predict not only the top predictions, but also top alternatives with the accuracy@k metric. In addition, we propose Choice F1: a metric inspired by F1 score which evaluates a prediction system's ability to find all plausible futures while keeping only the most probable ones. To evaluate this problem, we present a novel method, Predicting the Likely Alternative Next Best, or PLAN-B, for action prediction which automatically finds the set of most likely alternative futures. PLAN-B consists of two novel components: (i) a Choice Table which ensures that all possible futures are found, and (ii) a "Collaborative" RNN system which combines both action sequence and feature information. We demonstrate that our system outperforms state-of-the-art results on benchmark datasets.
},
keywords = {activity recognition, arXiv, computer vision},
pubstate = {published},
tppubtype = {techreport}
}
Erik Wijmans, Julian Straub, Dhruv Batra, Irfan Essa, Judy Hoffman, Ari Morcos
Analyzing Visual Representations in Embodied Navigation Tasks Technical Report
no. arXiv:2003.05993, 2020.
Abstract | Links | BibTeX | Tags: arXiv, embodied agents, navigation
@techreport{2020-Wijmans-AVRENT,
title = {Analyzing Visual Representations in Embodied Navigation Tasks},
author = {Erik Wijmans and Julian Straub and Dhruv Batra and Irfan Essa and Judy Hoffman and Ari Morcos},
url = {https://arxiv.org/abs/2003.05993
https://arxiv.org/pdf/2003.05993},
doi = {10.48550/arXiv.2003.05993},
year = {2020},
date = {2020-03-01},
urldate = {2020-03-01},
journal = {arXiv},
number = {arXiv:2003.05993},
abstract = {Recent advances in deep reinforcement learning require a large amount of training data and generally result in representations that are often over specialized to the target task. In this work, we present a methodology to study the underlying potential causes for this specialization. We use the recently proposed projection weighted Canonical Correlation Analysis (PWCCA) to measure the similarity of visual representations learned in the same environment by performing different tasks.
We then leverage our proposed methodology to examine the task dependence of visual representations learned on related but distinct embodied navigation tasks. Surprisingly, we find that slight differences in task have no measurable effect on the visual representation for both SqueezeNet and ResNet architectures. We then empirically demonstrate that visual representations learned on one task can be effectively transferred to a different task.},
howpublished = {arXiv:2003.05993},
keywords = {arXiv, embodied agents, navigation},
pubstate = {published},
tppubtype = {techreport}
}
Jonathan C Balloch, Varun Agrawal, Irfan Essa, Sonia Chernova
Unbiasing Semantic Segmentation For Robot Perception using Synthetic Data Feature Transfer Technical Report
no. arXiv:1809.03676, 2018.
Abstract | Links | BibTeX | Tags: arXiv, robotics, scene understanding
@techreport{2018-Balloch-USSRPUSDFT,
title = {Unbiasing Semantic Segmentation For Robot Perception using Synthetic Data Feature Transfer},
author = {Jonathan C Balloch and Varun Agrawal and Irfan Essa and Sonia Chernova},
url = {https://doi.org/10.48550/arXiv.1809.03676},
doi = {10.48550/arXiv.1809.03676},
year = {2018},
date = {2018-09-01},
urldate = {2018-09-01},
journal = {arXiv},
number = {arXiv:1809.03676},
abstract = {Robot perception systems need to perform reliable image segmentation in real-time on noisy, raw perception data. State-of-the-art segmentation approaches use large CNN models and carefully constructed datasets; however, these models focus on accuracy at the cost of real-time inference. Furthermore, the standard semantic segmentation datasets are not large enough for training CNNs without augmentation and are not representative of noisy, uncurated robot perception data. We propose improving the performance of real-time segmentation frameworks on robot perception data by transferring features learned from synthetic segmentation data. We show that pretraining real-time segmentation architectures with synthetic segmentation data instead of ImageNet improves fine-tuning performance by reducing the bias learned in pretraining and closing the \textit{transfer gap} as a result. Our experiments show that our real-time robot perception models pretrained on synthetic data outperform those pretrained on ImageNet for every scale of fine-tuning data examined. Moreover, the degree to which synthetic pretraining outperforms ImageNet pretraining increases as the availability of robot data decreases, making our approach attractive for robotics domains where dataset collection is hard and/or expensive.
},
howpublished = {arXiv:1809.03676},
keywords = {arXiv, robotics, scene understanding},
pubstate = {published},
tppubtype = {techreport}
}
Steven Hickson, Anelia Angelova, Irfan Essa, Rahul Sukthankar
Object category learning and retrieval with weak supervision Technical Report
no. arXiv:1801.08985, 2018.
Abstract | Links | BibTeX | Tags: arXiv, computer vision, machine learning, object detection
@techreport{2018-Hickson-OCLRWWS,
title = {Object category learning and retrieval with weak supervision},
author = {Steven Hickson and Anelia Angelova and Irfan Essa and Rahul Sukthankar},
url = {https://arxiv.org/abs/1801.08985
https://arxiv.org/pdf/1801.08985},
doi = {10.48550/arXiv.1801.08985},
year = {2018},
date = {2018-07-01},
urldate = {2018-07-01},
journal = {arXiv},
number = {arXiv:1801.08985},
abstract = {We consider the problem of retrieving objects from image data and learning to classify them into meaningful semantic categories with minimal supervision. To that end, we propose a fully differentiable unsupervised deep clustering approach to learn semantic classes in an end-to-end fashion without individual class labeling using only unlabeled object proposals. The key contributions of our work are 1) a kmeans clustering objective where the clusters are learned as parameters of the network and are represented as memory units, and 2) simultaneously building a feature representation, or embedding, while learning to cluster it. This approach shows promising results on two popular computer vision datasets: on CIFAR10 for clustering objects, and on the more complex and challenging Cityscapes dataset for semantically discovering classes which visually correspond to cars, people, and bicycles. Currently, the only supervision provided is segmentation objectness masks, but this method can be extended to use an unsupervised objectness-based object generation mechanism which will make the approach completely unsupervised.
},
howpublished = {arXiv:1801.08985},
keywords = {arXiv, computer vision, machine learning, object detection},
pubstate = {published},
tppubtype = {techreport}
}
Huda Alamri, Vincent Cartillier, Raphael Gontijo Lopes, Abhishek Das, Jue Wang, Irfan Essa, Dhruv Batra, Devi Parikh, Anoop Cherian, Tim K Marks, Chiori Hori
Audio Visual Scene-Aware Dialog (AVSD) Challenge at DSTC7 Technical Report
no. arXiv:1806.00525, 2018.
Abstract | Links | BibTeX | Tags: arXiv, embodied agents, multimedia, vision & language
@techreport{2018-Alamri-AVSDACD,
title = {Audio Visual Scene-Aware Dialog (AVSD) Challenge at DSTC7},
author = {Huda Alamri and Vincent Cartillier and Raphael Gontijo Lopes and Abhishek Das and Jue Wang and Irfan Essa and Dhruv Batra and Devi Parikh and Anoop Cherian and Tim K Marks and Chiori Hori},
url = {https://video-dialog.com/
https://arxiv.org/abs/1806.00525},
doi = {10.48550/arXiv.1806.00525},
year = {2018},
date = {2018-06-01},
urldate = {2018-06-01},
journal = {arXiv},
number = {arXiv:1806.00525},
abstract = {Scene-aware dialog systems will be able to have conversations with users about the objects and events around them. Progress on such systems can be made by integrating state-of-the-art technologies from multiple research areas, including end-to-end dialog systems, visual dialog, and video description. We introduce the Audio Visual Scene Aware Dialog (AVSD) challenge and dataset. In this challenge, which is one track of the 7th Dialog System Technology Challenges (DSTC7) workshop, the task is to build a system that generates responses in a dialog about an input video.
},
howpublished = {arXiv:1806.00525},
keywords = {arXiv, embodied agents, multimedia, vision & language},
pubstate = {published},
tppubtype = {techreport}
}
Other Publication Sites
A few more sites that aggregate research publications: Academia.edu, Bibsonomy, CiteULike, Mendeley.
Copyright/About
[Please see the Copyright Statement that may apply to the content listed here.]
This list of publications is produced by using the teachPress plugin for WordPress.