A searchable list of some of my publications is below. You can also access my publications from the following sites.
My ORCID is
Publications:
Lijun Yu, José Lezama, Nitesh B. Gundavarapu, Luca Versari, Kihyuk Sohn, David Minnen, Yong Cheng, Vighnesh Birodkar, Agrim Gupta, Xiuye Gu, Alexander G. Hauptmann, Boqing Gong, Ming-Hsuan Yang, Irfan Essa, David A. Ross, Lu Jiang
Language Model Beats Diffusion -- Tokenizer is Key to Visual Generation Proceedings Article
In: Proceedings of the International Conference on Learning Representations (ICLR), 2024.
Abstract | Links | BibTeX | Tags: AI, arXiv, computer vision, generative AI, google, ICLR
@inproceedings{2024-Yu-LMBDVG,
title = {Language Model Beats Diffusion -- Tokenizer is Key to Visual Generation},
author = {Lijun Yu and José Lezama and Nitesh B. Gundavarapu and Luca Versari and Kihyuk Sohn and David Minnen and Yong Cheng and Vighnesh Birodkar and Agrim Gupta and Xiuye Gu and Alexander G. Hauptmann and Boqing Gong and Ming-Hsuan Yang and Irfan Essa and David A. Ross and Lu Jiang},
url = {https://arxiv.org/abs/2310.05737
https://arxiv.org/pdf/2310.05737},
doi = {10.48550/arXiv.2310.05737},
year = {2024},
date = {2024-05-14},
urldate = {2024-05-14},
booktitle = {Proceedings of the International Conference on Learning Representations (ICLR)},
abstract = {While Large Language Models (LLMs) are the dominant models for generative tasks in language, they do not perform as well as diffusion models on image and video generation. To effectively use LLMs for visual generation, one crucial component is the visual tokenizer that maps pixel-space inputs to discrete tokens appropriate for LLM learning. In this paper, we introduce MAGVIT-v2, a video tokenizer designed to generate concise and expressive tokens for both videos and images using a common token vocabulary. Equipped with this new tokenizer, we show that LLMs outperform diffusion models on standard image and video generation benchmarks including ImageNet and Kinetics. In addition, we demonstrate that our tokenizer surpasses the previously top-performing video tokenizer on two more tasks: (1) video compression comparable to the next-generation video codec (VCC) according to human evaluations, and (2) learning effective representations for action recognition tasks.
},
keywords = {AI, arXiv, computer vision, generative AI, google, ICLR},
pubstate = {published},
tppubtype = {inproceedings}
}
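The MAGVIT-v2 abstract above hinges on a visual tokenizer that maps pixel-space inputs to discrete tokens an LLM can model. Below is a minimal illustrative sketch of that general idea (patch-wise encoding followed by vector quantization against a codebook); the patch size, feature dimension, codebook size, and linear encoder are placeholder assumptions, not the MAGVIT-v2 architecture.

# Illustrative sketch only: maps an image to a grid of discrete token ids by
# encoding fixed-size patches and snapping each feature to its nearest codebook entry.
import numpy as np

rng = np.random.default_rng(0)
PATCH, DIM, VOCAB = 8, 32, 1024                        # toy patch size, feature dim, codebook size
encoder = rng.normal(size=(PATCH * PATCH * 3, DIM))    # stand-in for a learned encoder
codebook = rng.normal(size=(VOCAB, DIM))               # learned codebook in a real tokenizer

def tokenize(image: np.ndarray) -> np.ndarray:
    """Map an (H, W, 3) image to a grid of discrete token ids."""
    h, w, _ = image.shape
    tokens = np.empty((h // PATCH, w // PATCH), dtype=np.int64)
    for i in range(0, h, PATCH):
        for j in range(0, w, PATCH):
            feat = image[i:i + PATCH, j:j + PATCH].reshape(-1) @ encoder
            # vector quantization: nearest codebook entry by Euclidean distance
            tokens[i // PATCH, j // PATCH] = np.argmin(np.linalg.norm(codebook - feat, axis=1))
    return tokens

image = rng.random((64, 64, 3))
print(tokenize(image).shape)   # (8, 8) grid of token ids for the LLM

The returned id grid is what a language model would be trained on in place of raw pixels; a paired decoder (omitted here) would map ids back to pixels.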
Karan Samel, Jun Ma, Zhengyang Wang, Tong Zhao, Irfan Essa
Knowledge Relevance BERT: Integrating Noisy Knowledge into Language Representation. Proceedings Article
In: AAAI workshop on Knowledge Augmented Methods for NLP (KnowledgeNLP-AAAI 2023), 2023.
Abstract | Links | BibTeX | Tags: AI, knowledge representation, NLP
@inproceedings{2023-Samel-KRBINKILR,
title = {Knowledge Relevance BERT: Integrating Noisy Knowledge into Language Representation.},
author = {Karan Samel and Jun Ma and Zhengyang Wang and Tong Zhao and Irfan Essa},
url = {https://knowledge-nlp.github.io/aaai2023/papers/005-KRBERT-oral.pdf},
year = {2023},
date = {2023-02-01},
urldate = {2023-02-01},
booktitle = {AAAI workshop on Knowledge Augmented Methods for NLP (KnowledgeNLP-AAAI 2023)},
abstract = {Integrating structured knowledge into language model representations increases recall of domain-specific information useful for downstream tasks. Matching between knowledge graph entities and text entity mentions can be easily performed when entity names are unique or entity-linking data exists. When extending this setting to new domains, newly mined knowledge contains ambiguous and incorrect information without explicit linking information. In such settings, we design a framework to robustly link relevant knowledge to input texts as an intermediate modeling step while performing end-to-end domain fine-tuning tasks. This is done by first computing the similarity of the existing task labels with candidate knowledge triplets to generate relevance labels. We use these labels to train a relevance model, which predicts the relevance of the inserted triplets to the original text. This relevance model is integrated within a language model, leading to our Knowledge Relevance BERT (KR-BERT) framework. We test KR-BERT for linking and ranking tasks on a real-world e-commerce dataset and a public entity linking task, where we show performance improvements over strong baselines.},
keywords = {AI, knowledge representation, NLP},
pubstate = {published},
tppubtype = {inproceedings}
}
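The KR-BERT abstract describes generating weak relevance labels by comparing existing task labels against candidate knowledge triplets, then training a relevance model on those labels. The snippet below is a hedged, simplified sketch of only that labeling step; the Jaccard similarity, threshold, and example triplets are illustrative assumptions rather than the paper's implementation.

# Illustrative weak-labeling sketch: score candidate triplets against a task label
# and threshold the score into a binary relevance label used to train a relevance model.
def jaccard(a: str, b: str) -> float:
    sa, sb = set(a.lower().split()), set(b.lower().split())
    return len(sa & sb) / len(sa | sb) if sa | sb else 0.0

def relevance_labels(task_label: str, triplets: list[tuple[str, str, str]],
                     threshold: float = 0.2) -> list[int]:
    """Return 1 for triplets judged relevant to the task label, else 0."""
    return [int(jaccard(task_label, " ".join(t)) >= threshold) for t in triplets]

triplets = [("usb cable", "compatible_with", "laptop"),
            ("running shoes", "has_color", "blue")]
print(relevance_labels("usb-c laptop charging cable", triplets))   # [1, 0]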
Vincent Cartillier, Zhile Ren, Neha Jain, Stefan Lee, Irfan Essa, Dhruv Batra
Semantic MapNet: Building Allocentric Semantic Maps and Representations from Egocentric Views Proceedings Article
In: Proceedings of the AAAI Conference on Artificial Intelligence (AAAI), AAAI, 2021.
Abstract | Links | BibTeX | Tags: AAAI, AI, embodied agents, first-person vision
@inproceedings{2021-Cartillier-SMBASRFEV,
title = {Semantic MapNet: Building Allocentric Semantic Maps and Representations from Egocentric Views},
author = {Vincent Cartillier and Zhile Ren and Neha Jain and Stefan Lee and Irfan Essa and Dhruv Batra},
url = {https://arxiv.org/abs/2010.01191
https://vincentcartillier.github.io/smnet.html
https://ojs.aaai.org/index.php/AAAI/article/view/16180/15987},
doi = {10.48550/arXiv.2010.01191},
year = {2021},
date = {2021-02-01},
urldate = {2021-02-01},
booktitle = {Proceedings of the AAAI Conference on Artificial Intelligence (AAAI)},
publisher = {AAAI},
abstract = {We study the task of semantic mapping -- specifically, an embodied agent (a robot or an egocentric AI assistant) is given a tour of a new environment and asked to build an allocentric top-down semantic map (`what is where?') from egocentric observations of an RGB-D camera with known pose (via localization sensors). Importantly, our goal is to build neural episodic memories and spatio-semantic representations of 3D spaces that enable the agent to easily learn subsequent tasks in the same space -- navigating to objects seen during the tour (`Find chair') or answering questions about the space (`How many chairs did you see in the house?').
Towards this goal, we present Semantic MapNet (SMNet), which consists of: (1) an Egocentric Visual Encoder that encodes each egocentric RGB-D frame, (2) a Feature Projector that projects egocentric features to appropriate locations on a floor-plan, (3) a Spatial Memory Tensor of size floor-plan length × width × feature-dims that learns to accumulate projected egocentric features, and (4) a Map Decoder that uses the memory tensor to produce semantic top-down maps. SMNet combines the strengths of (known) projective camera geometry and neural representation learning. On the task of semantic mapping in the Matterport3D dataset, SMNet significantly outperforms competitive baselines by 4.01-16.81% (absolute) on mean-IoU and 3.81-19.69% (absolute) on Boundary-F1 metrics. Moreover, we show how to use the spatio-semantic allocentric representations built by SMNet for the task of ObjectNav and Embodied Question Answering.},
keywords = {AAAI, AI, embodied agents, first-person vision},
pubstate = {published},
tppubtype = {inproceedings}
}
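The SMNet abstract describes projecting egocentric features onto floor-plan locations and accumulating them in a Spatial Memory Tensor of size length × width × feature-dims. Below is a minimal sketch of that accumulation step, under the assumption that features have already been projected to integer floor-plan cells via depth and known pose; the grid size and feature dimension are arbitrary placeholders, and the paper's encoder and decoder are omitted.

# Illustrative sketch only: average projected egocentric features into a
# length x width x feature-dims top-down memory grid.
import numpy as np

L, W, D = 100, 100, 16                     # floor-plan cells and feature dims (placeholders)
memory = np.zeros((L, W, D))
counts = np.zeros((L, W, 1))

def accumulate(xy: np.ndarray, feats: np.ndarray) -> None:
    """xy: (N, 2) integer floor-plan cells; feats: (N, D) projected egocentric features."""
    for (x, y), f in zip(xy, feats):
        memory[x, y] += f
        counts[x, y] += 1

rng = np.random.default_rng(0)
accumulate(rng.integers(0, L, size=(500, 2)), rng.normal(size=(500, D)))
mean_memory = memory / np.maximum(counts, 1)   # a map decoder would read this tensor
print(mean_memory.shape)                        # (100, 100, 16)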
Zoher Ghogawala, Melissa Dunbar, Irfan Essa
Artificial Intelligence for the Treatment of Lumbar Spondylolisthesis Journal Article
In: Neurosurgery Clinics of North America, vol. 30, no. 3, pp. 383-389, 2019, ISSN: 1042-3680, (Lumbar Spondylolisthesis).
Abstract | Links | BibTeX | Tags: AI, computational health, Predictive analytics
@article{2019-Ghogawala-AITLS,
title = {Artificial Intelligence for the Treatment of Lumbar Spondylolisthesis},
author = {Zoher Ghogawala and Melissa Dunbar and Irfan Essa},
url = {http://www.sciencedirect.com/science/article/pii/S1042368019300257
https://pubmed.ncbi.nlm.nih.gov/31078239/},
doi = {10.1016/j.nec.2019.02.012},
issn = {1042-3680},
year = {2019},
date = {2019-07-01},
urldate = {2019-07-01},
journal = {Neurosurgery Clinics of North America},
volume = {30},
number = {3},
pages = {383-389},
abstract = {Multiple registries are currently collecting patient-specific data on lumbar spondylolisthesis including outcomes data. The collection of imaging diagnostics data along with comparative outcomes data following decompression versus decompression and fusion treatments for degenerative spondylolisthesis represents an enormous opportunity for modern machine-learning analytics research.
},
note = {Lumbar Spondylolisthesis},
keywords = {AI, computational health, Predictive analytics},
pubstate = {published},
tppubtype = {article}
}
Zoher Ghogawala, Melissa Dunbar, Irfan Essa
Lumbar spondylolisthesis: modern registries and the development of artificial intelligence Journal Article
In: Journal of Neurosurgery: Spine (JNSPG 75th Anniversary Invited Review Article), vol. 30, no. 6, pp. 729-735, 2019.
Links | BibTeX | Tags: AI, computational health, Predictive analytics
@article{2019-Ghogawala-LSMRDAI,
title = {Lumbar spondylolisthesis: modern registries and the development of artificial intelligence},
author = {Zoher Ghogawala and Melissa Dunbar and Irfan Essa},
doi = {10.3171/2019.2.SPINE18751},
year = {2019},
date = {2019-06-01},
urldate = {2019-06-01},
journal = {Journal of Neurosurgery: Spine (JNSPG 75th Anniversary Invited Review Article)},
volume = {30},
number = {6},
pages = {729-735},
keywords = {AI, computational health, Predictive analytics},
pubstate = {published},
tppubtype = {article}
}
Edison Thomaz, Cheng Zhang, Irfan Essa, Gregory Abowd
Inferring Meal Eating Activities in Real World Settings from Ambient Sounds: A Feasibility Study Best Paper Proceedings Article
In: ACM Conference on Intelligent User Interfaces (IUI), 2015.
Abstract | Links | BibTeX | Tags: ACM, activity recognition, AI, awards, behavioral imaging, best paper award, computational health, IUI, machine learning
@inproceedings{2015-Thomaz-IMEARWSFASFS,
title = {Inferring Meal Eating Activities in Real World Settings from Ambient Sounds: A Feasibility Study},
author = {Edison Thomaz and Cheng Zhang and Irfan Essa and Gregory Abowd},
url = {https://dl.acm.org/doi/10.1145/2678025.2701405},
doi = {10.1145/2678025.2701405},
year = {2015},
date = {2015-05-01},
urldate = {2015-05-01},
booktitle = {ACM Conference on Intelligent User Interfaces (IUI)},
abstract = {Dietary self-monitoring has been shown to be an effective method for weight-loss, but it remains an onerous task despite recent advances in food journaling systems. Semi-automated food journaling can reduce the effort of logging, but often requires that eating activities be detected automatically. In this work we describe results from a feasibility study conducted in-the-wild where eating activities were inferred from ambient sounds captured with a wrist-mounted device; twenty participants wore the device during one day for an average of 5 hours while performing normal everyday activities. Our system was able to identify meal eating with an F-score of 79.8% in a person-dependent evaluation, and with 86.6% accuracy in a person-independent evaluation. Our approach is intended to be practical, leveraging off-the-shelf devices with audio sensing capabilities in contrast to systems for automated dietary assessment based on specialized sensors.},
keywords = {ACM, activity recognition, AI, awards, behavioral imaging, best paper award, computational health, IUI, machine learning},
pubstate = {published},
tppubtype = {inproceedings}
}
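The abstract above reports both a person-dependent and a person-independent evaluation of eating detection from ambient sounds. The sketch below illustrates the person-independent protocol as leave-one-participant-out cross-validation; the random features, labels, and scikit-learn classifier are stand-in assumptions, not the study's actual pipeline.

# Illustrative evaluation sketch: leave-one-participant-out cross-validation
# over per-window audio features, so each test fold contains an unseen person.
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import LeaveOneGroupOut, cross_val_score

rng = np.random.default_rng(0)
X = rng.normal(size=(400, 13))          # placeholder MFCC-style features per audio window
y = rng.integers(0, 2, size=400)        # 1 = eating, 0 = other activity (placeholder labels)
groups = rng.integers(0, 20, size=400)  # participant id for each window

scores = cross_val_score(RandomForestClassifier(n_estimators=50, random_state=0),
                         X, y, groups=groups, cv=LeaveOneGroupOut())
print(f"person-independent accuracy: {scores.mean():.3f}")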
Other Publication Sites
A few more sites that aggregate research publications: Academia.edu, Bibsonomy, CiteULike, Mendeley.
Copyright/About
[Please see the Copyright Statement that may apply to the content listed here.]
This list of publications is produced using the teachPress plugin for WordPress.