A searchable list of some of my publications is below. You can also access my publications from the sites listed at the end of this page.
My ORCID is https://orcid.org/0000-0002-6236-2969
Publications:
Hsin-Ying Lee, Lu Jiang, Irfan Essa, Madison Le, Haifeng Gong, Ming-Hsuan Yang, Weilong Yang
Neural Design Network: Graphic Layout Generation with Constraints Proceedings Article
In: Proceedings of European Conference on Computer Vision (ECCV), 2020.
Links | BibTeX | Tags: computer vision, content creation, ECCV, generative media, google
@inproceedings{2020-Lee-NDNGLGWC,
title = {Neural Design Network: Graphic Layout Generation with Constraints},
author = {Hsin-Ying Lee and Lu Jiang and Irfan Essa and Madison Le and Haifeng Gong and Ming-Hsuan Yang and Weilong Yang},
url = {https://arxiv.org/abs/1912.09421
https://rdcu.be/c7sqw},
doi = {10.1007/978-3-030-58580-8_29},
year = {2020},
date = {2020-08-01},
urldate = {2020-08-01},
booktitle = {Proceedings of European Conference on Computer Vision (ECCV)},
keywords = {computer vision, content creation, ECCV, generative media, google},
pubstate = {published},
tppubtype = {inproceedings}
}
Caroline Pantofaru, Vinay Bettadapura, Krishna Bharat, Irfan Essa
Systems and methods for directing content generation using a first-person point-of-view device. Patent
2020.
Abstract | Links | BibTeX | Tags: computer vision, google, patents
@patent{2020-Pantofaru-SMDCGUFPD,
title = {Systems and methods for directing content generation using a first-person point-of-view device.},
author = {Caroline Pantofaru and Vinay Bettadapura and Krishna Bharat and Irfan Essa},
url = {https://patents.google.com/patent/US10721439},
year = {2020},
date = {2020-07-21},
urldate = {2020-07-01},
publisher = {(US Patent # 10721439)},
abstract = {A method for personalizing a content item using captured footage is disclosed. The method includes receiving a first video feed from a first camera, wherein the first camera is designated as a source camera for capturing an event during a first time duration. The method also includes receiving data from a second camera, and determining, based on the received data from the second camera, that an action was performed using the second camera, the action being indicative of a region of interest (ROI) of the user of the second camera occurring within a second time duration. The method further includes designating the second camera as the source camera for capturing the event during the second time duration.
},
howpublished = {US Patent # 10721439},
keywords = {computer vision, google, patents},
pubstate = {published},
tppubtype = {patent}
}
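The method claimed in this entry's abstract reduces to switching the designated source camera onto the first-person device whenever an attention-indicating action is detected within a time window. The sketch below illustrates that switching logic under simple assumptions; ActionEvent, choose_source_camera, and the camera names are hypothetical, not from the patent.

# Hypothetical sketch of the source-camera switching described in the abstract.
from dataclasses import dataclass

@dataclass
class ActionEvent:
    start: float   # seconds into the event when the ROI action begins
    end: float     # end of the region-of-interest duration

def choose_source_camera(t, roi_events, default_cam="cam_1", fppov_cam="cam_2"):
    """Designate the first-person camera as source while an ROI action is active."""
    for ev in roi_events:
        if ev.start <= t < ev.end:
            return fppov_cam
    return default_cam

# Example: the wearer performs the action between t=30s and t=45s.
events = [ActionEvent(start=30.0, end=45.0)]
print(choose_source_camera(10.0, events))  # cam_1
print(choose_source_camera(35.0, events))  # cam_2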
Steven Hickson, Karthik Raveendran, Alireza Fathi, Kevin Murphy, Irfan Essa
Floors are Flat: Leveraging Semantics for Real-Time Surface Normal Prediction Proceedings Article
In: IEEE International Conference on Computer Vision (ICCV) Workshop on Geometry Meets Deep Learning, 2019.
Abstract | Links | BibTeX | Tags: computer vision, google, ICCV
@inproceedings{2019-Hickson-FFLSRSNP,
title = {Floors are Flat: Leveraging Semantics for Real-Time Surface Normal Prediction},
author = {Steven Hickson and Karthik Raveendran and Alireza Fathi and Kevin Murphy and Irfan Essa},
url = {https://arxiv.org/abs/1906.06792
https://openaccess.thecvf.com/content_ICCVW_2019/papers/GMDL/Hickson_Floors_are_Flat_Leveraging_Semantics_for_Real-Time_Surface_Normal_Prediction_ICCVW_2019_paper.pdf},
doi = {10.1109/ICCVW.2019.00501},
year = {2019},
date = {2019-10-01},
urldate = {2019-10-01},
booktitle = {IEEE International Conference on Computer Vision (ICCV) Workshop on Geometry Meets Deep Learning},
abstract = {We propose 4 insights that help to significantly improve the performance of deep learning models that predict surface normals and semantic labels from a single RGB image. These insights are: (1) denoise the "ground truth" surface normals in the training set to ensure consistency with the semantic labels; (2) concurrently train on a mix of real and synthetic data, instead of pretraining on synthetic and fine-tuning on real; (3) jointly predict normals and semantics using a shared model, but only backpropagate errors on pixels that have valid training labels; (4) slim down the model and use grayscale instead of color inputs. Despite the simplicity of these steps, we demonstrate consistently improved state of the art results on several datasets, using a model that runs at 12 fps on a standard mobile phone.
},
howpublished = {arXiv preprint arXiv:1906.06792},
keywords = {computer vision, google, ICCV},
pubstate = {published},
tppubtype = {inproceedings}
}
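Insight (3) in the abstract above, jointly predicting normals and semantics while backpropagating only through pixels that carry valid labels, corresponds to a masked joint loss. A minimal PyTorch-style sketch of such a loss is below; the tensor shapes, loss weighting, and function name are illustrative assumptions rather than the authors' implementation.

# Illustrative masked joint loss for normals + semantics (not the authors' code).
import torch
import torch.nn.functional as F

def joint_masked_loss(pred_normals, pred_logits, gt_normals, gt_labels,
                      normal_valid, label_valid, ignore_index=255):
    # pred_normals: (B,3,H,W), pred_logits: (B,C,H,W)
    # normal_valid / label_valid: boolean masks of shape (B,H,W)
    # Cosine loss on pixels with valid ground-truth normals only.
    cos = F.cosine_similarity(pred_normals, gt_normals, dim=1)   # (B,H,W)
    normal_loss = (1.0 - cos)[normal_valid].mean()

    # Cross-entropy on pixels with valid semantic labels only.
    labels = gt_labels.clone()
    labels[~label_valid] = ignore_index
    sem_loss = F.cross_entropy(pred_logits, labels, ignore_index=ignore_index)
    return normal_loss + sem_loss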
Thad Eugene Starner, Irfan Essa, Hayes Solos Raffle, Daniel Aminzade
Object occlusion to initiate a visual search Patent
2019, (US Patent 10,437,882).
Abstract | Links | BibTeX | Tags: computer vision, google, patents
@patent{2019-Starner-OOIVS,
title = {Object occlusion to initiate a visual search},
author = {Thad Eugene Starner and Irfan Essa and Hayes Solos Raffle and Daniel Aminzade},
url = {https://patents.google.com/patent/US10437882},
year = {2019},
date = {2019-10-01},
urldate = {2019-10-01},
publisher = {(US Patent # 10437882)},
abstract = {Methods, systems, and apparatus, including computer programs encoded on computer storage media, for video segmentation. One of the methods includes receiving a digital video; performing hierarchical graph-based video segmentation on at least one frame of the digital video to generate a boundary representation for the at least one frame; generating a vector representation from the boundary representation for the at least one frame of the digital video, wherein generating the vector representation includes generating a polygon composed of at least three vectors, wherein each vector comprises two vertices connected by a line segment, from a boundary in the boundary representation; linking the vector representation to the at least one frame of the digital video; and storing the vector representation with the at least one frame of the digital video.
},
howpublished = {US Patent # 10437882},
note = {US Patent 10,437,882},
keywords = {computer vision, google, patents},
pubstate = {published},
tppubtype = {patent}
}
Huda Alamri, Vincent Cartillier, Abhishek Das, Jue Wang, Anoop Cherian, Irfan Essa, Dhruv Batra, Tim K. Marks, Chiori Hori, Peter Anderson, Stefan Lee, Devi Parikh
Audio Visual Scene-Aware Dialog Proceedings Article
In: IEEE Conference on Computer Vision and Pattern Recognition (CVPR), 2019.
Abstract | Links | BibTeX | Tags: computational video, computer vision, CVPR, embodied agents, vision & language
@inproceedings{2019-Alamri-AVSD,
title = {Audio Visual Scene-Aware Dialog},
author = {Huda Alamri and Vincent Cartillier and Abhishek Das and Jue Wang and Anoop Cherian and Irfan Essa and Dhruv Batra and Tim K. Marks and Chiori Hori and Peter Anderson and Stefan Lee and Devi Parikh},
url = {https://openaccess.thecvf.com/content_CVPR_2019/papers/Alamri_Audio_Visual_Scene-Aware_Dialog_CVPR_2019_paper.pdf
https://video-dialog.com/
https://arxiv.org/abs/1901.09107},
doi = {10.1109/CVPR.2019.00774},
year = {2019},
date = {2019-06-01},
urldate = {2019-06-01},
booktitle = {IEEE Conference on Computer Vision and Pattern Recognition (CVPR)},
abstract = {We introduce the task of scene-aware dialog. Our goal is to generate a complete and natural response to a question about a scene, given video and audio of the scene and the history of previous turns in the dialog. To answer successfully, agents must ground concepts from the question in the video while leveraging contextual cues from the dialog history. To benchmark this task, we introduce the Audio Visual Scene-Aware Dialog (AVSD) Dataset. For each of more than 11,000 videos of human actions from the Charades dataset, our dataset contains a dialog about the video, plus a final summary of the video by one of the dialog participants. We train several baseline systems for this task and evaluate the performance of the trained models using both qualitative and quantitative metrics. Our results indicate that models must utilize all the available inputs (video, audio, question, and dialog history) to perform best on this dataset.
},
keywords = {computational video, computer vision, CVPR, embodied agents, vision & language},
pubstate = {published},
tppubtype = {inproceedings}
}
Erik Wijmans, Samyak Datta, Oleksandr Maksymets, Abhishek Das, Georgia Gkioxari, Stefan Lee, Irfan Essa, Devi Parikh, Dhruv Batra
Embodied Question Answering in Photorealistic Environments With Point Cloud Perception Proceedings Article
In: IEEE Conference on Computer Vision and Pattern Recognition (CVPR), 2019.
Links | BibTeX | Tags: computer vision, CVPR, vision & language
@inproceedings{2019-Wijmans-EQAPEWPCP,
title = {Embodied Question Answering in Photorealistic Environments With Point Cloud Perception},
author = {Erik Wijmans and Samyak Datta and Oleksandr Maksymets and Abhishek Das and Georgia Gkioxari and Stefan Lee and Irfan Essa and Devi Parikh and Dhruv Batra},
doi = {10.1109/CVPR.2019.00682},
year = {2019},
date = {2019-06-01},
booktitle = {IEEE Conference on Computer Vision and Pattern Recognition (CVPR)},
keywords = {computer vision, CVPR, vision & language},
pubstate = {published},
tppubtype = {inproceedings}
}
Unaiza Ahsan, Rishi Madhok, Irfan Essa
Video Jigsaw: Unsupervised Learning of Spatiotemporal Context for Video Action Recognition Proceedings Article
In: IEEE Winter Conference on Applications of Computer Vision (WACV), pp. 179-189, 2019, ISSN: 1550-5790.
Links | BibTeX | Tags: activity recognition, computer vision, machine learning, WACV
@inproceedings{2019-Ahsan-VJULSCVAR,
title = {Video Jigsaw: Unsupervised Learning of Spatiotemporal Context for Video Action Recognition},
author = {Unaiza Ahsan and Rishi Madhok and Irfan Essa},
url = {https://ieeexplore.ieee.org/abstract/document/8659002},
doi = {10.1109/WACV.2019.00025},
issn = {1550-5790},
year = {2019},
date = {2019-01-01},
urldate = {2019-01-01},
booktitle = {IEEE Winter Conference on Applications of Computer Vision (WACV)},
pages = {179-189},
keywords = {activity recognition, computer vision, machine learning, WACV},
pubstate = {published},
tppubtype = {inproceedings}
}
Irfan Essa, Vivek Kwatra, Matthias Grundmann
Vector representation for video segmentation Patent
2018, (US Patent Application 14/587,420).
Links | BibTeX | Tags: computer vision, google, patents
@patent{2018-Essa-VRVS,
title = {Vector representation for video segmentation},
author = {Irfan Essa and Vivek Kwatra and Matthias Grundmann},
url = {https://patents.google.com/patent/US20180350131},
year = {2018},
date = {2018-12-06},
urldate = {2018-12-01},
publisher = {(US Patent Application # 14/587,420)},
howpublished = {US Patent # US20180350131A1},
note = {US Patent Application 14/587,420},
keywords = {computer vision, google, patents},
pubstate = {published},
tppubtype = {patent}
}
Caroline Pantofaru, Vinay Bettadapura, Krishna Bharat, Irfan Essa
Systems and methods for directing content generation using a first-person point-of-view device Patent
2018, (US Patent 10,110,850).
Abstract | Links | BibTeX | Tags: computer vision, google, patents
@patent{2018-Pantofaru-SMDCGUFPD,
title = {Systems and methods for directing content generation using a first-person point-of-view device},
author = {Caroline Pantofaru and Vinay Bettadapura and Krishna Bharat and Irfan Essa},
url = {https://patents.google.com/patent/US10110850},
year = {2018},
date = {2018-10-23},
urldate = {2018-10-01},
publisher = {(US Patent #10110850)},
abstract = {A method for localizing the attention of a user of a first-person point-of-view (FPPOV) device is disclosed. The method includes receiving data from an FPPOV device, the data being indicative of a first region-of-interest (ROI) of an event for a first time duration and a second ROI of the event for a second time duration. The method further include determining that a first camera from a plurality of cameras best captures the first ROI during the first time duration, and determining that a second camera from the plurality of cameras best captures the second ROI during the second time duration.
},
howpublished = {US Patent # US10110850B1},
note = {US Patent 10,110,850},
keywords = {computer vision, google, patents},
pubstate = {published},
tppubtype = {patent}
}
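The core determination in this patent's abstract, which fixed camera "best captures" the region of interest signalled by the FPPOV device during each time window, can be illustrated with a toy scoring rule. The sketch below picks the camera whose optical axis is angularly closest to the ROI bearing; the scoring rule and names are assumptions for illustration, not the patented method.

# Hypothetical "best camera for this ROI" selection by angular proximity.
import math

def best_camera(roi_bearing_deg, cameras):
    """cameras: dict of name -> optical-axis bearing in degrees."""
    def angular_gap(a, b):
        d = abs(a - b) % 360.0
        return min(d, 360.0 - d)
    return min(cameras, key=lambda name: angular_gap(roi_bearing_deg, cameras[name]))

rig = {"cam_A": 0.0, "cam_B": 120.0, "cam_C": 240.0}
print(best_camera(100.0, rig))  # cam_B is the closest match for this ROI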
Unaiza Ahsan, Rishi Madhok, Irfan Essa
Video Jigsaw: Unsupervised Learning of Spatiotemporal Context for Video Action Recognition Journal Article
In: arXiv, no. arXiv:1808.07507, 2018.
BibTeX | Tags: activity recognition, computer vision, machine learning
@article{2018-Ahsan-VJULSCVAR,
title = {Video Jigsaw: Unsupervised Learning of Spatiotemporal Context for Video Action Recognition},
author = {Unaiza Ahsan and Rishi Madhok and Irfan Essa},
year = {2018},
date = {2018-08-01},
journal = {arXiv},
number = {arXiv:1808.07507},
keywords = {activity recognition, computer vision, machine learning},
pubstate = {published},
tppubtype = {article}
}
Steven Hickson, Anelia Angelova, Irfan Essa, Rahul Sukthankar
Object category learning and retrieval with weak supervision Technical Report
no. arXiv:1801.08985, 2018.
Abstract | Links | BibTeX | Tags: arXiv, computer vision, machine learning, object detection
@techreport{2018-Hickson-OCLRWWS,
title = {Object category learning and retrieval with weak supervision},
author = {Steven Hickson and Anelia Angelova and Irfan Essa and Rahul Sukthankar},
url = {https://arxiv.org/abs/1801.08985
https://arxiv.org/pdf/1801.08985},
doi = {10.48550/arXiv.1801.08985},
year = {2018},
date = {2018-07-01},
urldate = {2018-07-01},
journal = {arXiv},
number = {arXiv:1801.08985},
abstract = {We consider the problem of retrieving objects from image data and learning to classify them into meaningful semantic categories with minimal supervision. To that end, we propose a fully differentiable unsupervised deep clustering approach to learn semantic classes in an end-to-end fashion without individual class labeling using only unlabeled object proposals. The key contributions of our work are 1) a kmeans clustering objective where the clusters are learned as parameters of the network and are represented as memory units, and 2) simultaneously building a feature representation, or embedding, while learning to cluster it. This approach shows promising results on two popular computer vision datasets: on CIFAR10 for clustering objects, and on the more complex and challenging Cityscapes dataset for semantically discovering classes which visually correspond to cars, people, and bicycles. Currently, the only supervision provided is segmentation objectness masks, but this method can be extended to use an unsupervised objectness-based object generation mechanism which will make the approach completely unsupervised.
},
howpublished = {arXiv:1801.08985},
keywords = {arXiv, computer vision, machine learning, object detection},
pubstate = {published},
tppubtype = {techreport}
}
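Contribution (1) in the abstract above, a k-means objective whose cluster centers are parameters of the network ("memory units") learned jointly with the embedding, can be read as a soft-assignment clustering loss. The following PyTorch sketch shows one plausible form under that reading; the layer sizes, softmax assignment, and class name are illustrative, not the released model.

# A plausible sketch of a differentiable k-means objective with learnable centroids.
import torch
import torch.nn as nn

class DeepCluster(nn.Module):
    def __init__(self, in_dim=512, feat_dim=128, n_clusters=10):
        super().__init__()
        self.embed = nn.Sequential(nn.Linear(in_dim, 256), nn.ReLU(),
                                   nn.Linear(256, feat_dim))
        # "Memory units": cluster centers stored as learnable parameters.
        self.centroids = nn.Parameter(torch.randn(n_clusters, feat_dim))

    def forward(self, x):
        z = self.embed(x)                              # (B, feat_dim) embedding
        d = torch.cdist(z, self.centroids)             # (B, K) distances to centers
        assign = torch.softmax(-d, dim=1)              # soft cluster assignment
        loss = (assign * d.pow(2)).sum(dim=1).mean()   # soft k-means objective
        return loss, assign

model = DeepCluster()
proposal_feats = torch.randn(32, 512)   # stand-in for object-proposal features
loss, _ = model(proposal_feats)
loss.backward()                          # gradients flow to embedding and centroids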
Matthias Grundmann, Vivek Kwatra, Irfan Essa
Cascaded camera motion estimation, rolling shutter detection, and camera shake detection for video stabilization Patent
2018, (US Patent 9,888,180).
Links | BibTeX | Tags: computer vision, google, patents
@patent{2018-Grundmann-CCMERSDCSDVS,
title = {Cascaded camera motion estimation, rolling shutter detection, and camera shake detection for video stabilization},
author = {Matthias Grundmann and Vivek Kwatra and Irfan Essa},
url = {https://patents.google.com/patent/US9888180},
year = {2018},
date = {2018-02-06},
urldate = {2018-02-01},
publisher = {(US Patent #9888180)},
howpublished = {US Patent # US9888180},
note = {US Patent 9,888,180},
keywords = {computer vision, google, patents},
pubstate = {published},
tppubtype = {patent}
}
Unaiza Ahsan, Chen Sun, Irfan Essa
DiscrimNet: Semi-Supervised Action Recognition from Videos using Generative Adversarial Networks Journal Article
In: arXiv, no. arXiv:1801.07230, 2018.
BibTeX | Tags: activity recognition, computer vision, machine learning
@article{2018-Ahsan-DSARFVUGAN,
title = {DiscrimNet: Semi-Supervised Action Recognition from Videos using Generative Adversarial Networks},
author = {Unaiza Ahsan and Chen Sun and Irfan Essa},
year = {2018},
date = {2018-01-01},
journal = {arXiv},
number = {arXiv:1801.07230},
keywords = {activity recognition, computer vision, machine learning},
pubstate = {published},
tppubtype = {article}
}
Unaiza Ahsan, Munmun De Choudhury, Irfan Essa
Towards Using Visual Attributes to Infer Image Sentiment Of Social Events Proceedings Article
In: Proceedings of The International Joint Conference on Neural Networks, International Neural Network Society, Anchorage, Alaska, US, 2017.
Abstract | Links | BibTeX | Tags: computational journalism, computer vision, IJNN, machine learning
@inproceedings{2017-Ahsan-TUVAIISSE,
title = {Towards Using Visual Attributes to Infer Image Sentiment Of Social Events},
author = {Unaiza Ahsan and Munmun De Choudhury and Irfan Essa},
url = {https://ieeexplore.ieee.org/abstract/document/7966013},
doi = {10.1109/IJCNN.2017.7966013},
year = {2017},
date = {2017-05-01},
urldate = {2017-05-01},
booktitle = {Proceedings of The International Joint Conference on Neural Networks},
publisher = {International Neural Network Society},
address = {Anchorage, Alaska, US},
abstract = {Widespread and pervasive adoption of smartphones has led to instant sharing of photographs that capture events ranging from mundane to life-altering happenings. We propose to capture sentiment information of such social event images leveraging their visual content. Our method extracts an intermediate visual representation of social event images based on the visual attributes that occur in the images going beyond sentiment-specific attributes. We map the top predicted attributes to sentiments and extract the dominant emotion associated with a picture of a social event. Unlike recent approaches, our method generalizes to a variety of social events and even to unseen events, which are not available at training time. We demonstrate the effectiveness of our approach on a challenging social event image dataset and our method outperforms state-of-the-art approaches for classifying complex event images into sentiments.
},
keywords = {computational journalism, computer vision, IJNN, machine learning},
pubstate = {published},
tppubtype = {inproceedings}
}
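The mapping step described in the abstract above, taking the top predicted visual attributes and aggregating their associated sentiments into a dominant emotion, can be illustrated with a toy lookup. The attribute lexicon, vote-counting rule, and scores below are hypothetical.

# Toy illustration of mapping top-scoring visual attributes to a dominant sentiment.
from collections import Counter

attribute_sentiment = {        # hypothetical attribute -> sentiment lexicon
    "smiling": "positive", "balloons": "positive", "candles": "positive",
    "crying": "negative", "smoke": "negative", "crowd": "neutral",
}

def dominant_sentiment(attribute_scores, top_k=3):
    top = sorted(attribute_scores, key=attribute_scores.get, reverse=True)[:top_k]
    votes = Counter(attribute_sentiment.get(a, "neutral") for a in top)
    return votes.most_common(1)[0][0]

scores = {"smiling": 0.91, "balloons": 0.84, "crowd": 0.40, "smoke": 0.05}
print(dominant_sentiment(scores))  # positive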
Irfan Essa, Matthias Grundmann, Jessica Hodgins, Kihwan Kim, Iain Matthews, Ariel Shamir
System and method for utilizing motion fields to predict evolution in dynamic scenes Patent
2017.
Abstract | Links | BibTeX | Tags: computer vision, patents, sports visualization
@patent{2017-Essa-SAMFUMFTPEIDS,
title = {System and method for utilizing motion fields to predict evolution in dynamic scenes},
author = {Irfan Essa and Matthias Grundmann and Jessica Hodgins and Kihwan Kim and Iain Matthews and Ariel Shamir},
url = {https://patents.google.com/patent/US9600760},
year = {2017},
date = {2017-03-21},
abstract = {Described herein are methods, systems, apparatuses and products for utilizing motion fields to predict evolution in dynamic scenes. One aspect provides for accessing active object position data including positioning information of a plurality of individual active objects; extracting a plurality of individual active object motions from the active object position data; constructing a motion field using the plurality of individual active object motions; and using the motion field to predict one or more points of convergence at one or more spatial locations that active objects are proceeding towards at a future point in time. Other embodiments are disclosed.
},
howpublished = {US Patent #US9600760},
keywords = {computer vision, patents, sports visualization},
pubstate = {published},
tppubtype = {patent}
}
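The pipeline in this patent's abstract, extracting per-object motions from position data, building a motion field, and predicting points of convergence, can be approximated by extrapolating object velocities and aggregating where they lead. The sketch below is a deliberately simplified stand-in for the patented estimator.

# Simplified sketch: extrapolate each tracked object's velocity and average the
# predicted future positions as a crude "point of convergence" estimate.
import numpy as np

def predict_convergence(positions_t0, positions_t1, horizon=2.0):
    """positions_*: (N,2) arrays of object locations at consecutive timesteps."""
    velocities = positions_t1 - positions_t0     # per-object motion vectors
    future = positions_t1 + horizon * velocities  # extrapolated locations
    return future.mean(axis=0)                    # single convergence estimate

p0 = np.array([[0.0, 0.0], [10.0, 0.0], [5.0, 8.0]])
p1 = np.array([[1.0, 1.0], [9.0, 1.0], [5.0, 7.0]])
print(predict_convergence(p0, p1))   # objects drifting toward the same region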
Unaiza Ahsan, Chen Sun, James Hays, Irfan Essa
Complex Event Recognition from Images with Few Training Examples Proceedings Article
In: IEEE Winter Conference on Applications of Computer Vision (WACV), 2017.
Abstract | Links | BibTeX | Tags: activity recognition, computer vision, machine learning, WACV
@inproceedings{2017-Ahsan-CERFIWTE,
title = {Complex Event Recognition from Images with Few Training Examples},
author = {Unaiza Ahsan and Chen Sun and James Hays and Irfan Essa},
url = {https://arxiv.org/abs/1701.04769
https://www.computer.org/csdl/proceedings-article/wacv/2017/07926663/12OmNzZEAzy},
doi = {10.1109/WACV.2017.80},
year = {2017},
date = {2017-03-01},
urldate = {2017-03-01},
booktitle = {IEEE Winter Conference on Applications of Computer Vision (WACV)},
abstract = {We propose to leverage concept-level representations for complex event recognition in photographs given limited training examples. We introduce a novel framework to discover event concept attributes from the web and use that to extract semantic features from images and classify them into social event categories with few training examples. Discovered concepts include a variety of objects, scenes, actions and event sub-types, leading to a discriminative and compact representation for event images. Web images are obtained for each discovered event concept and we use (pretrained) CNN features to train concept classifiers. Extensive experiments on challenging event datasets demonstrate that our proposed method outperforms several baselines using deep CNN features directly in classifying images into events with limited training examples. We also demonstrate that our method achieves the best overall accuracy on a dataset with unseen event categories using a single training example.
},
keywords = {activity recognition, computer vision, machine learning, WACV},
pubstate = {published},
tppubtype = {inproceedings}
}
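One plausible reading of the pipeline above, training a classifier per discovered concept on pretrained CNN features of web images and then representing an event photo by its vector of concept scores, is sketched below with scikit-learn. The feature extractor, concept names, and data are assumed for illustration.

# Illustrative pipeline: per-concept classifiers over pretrained CNN features,
# concept-score vectors as the compact semantic representation of an image.
import numpy as np
from sklearn.linear_model import LogisticRegression

def train_concept_bank(web_feats, web_labels):
    """web_feats: dict concept -> (N,D) CNN features of web images; labels are 0/1."""
    return {c: LogisticRegression(max_iter=1000).fit(X, web_labels[c])
            for c, X in web_feats.items()}

def concept_representation(img_feat, concept_bank):
    """Stack P(concept | image) into a semantic feature vector."""
    return np.array([clf.predict_proba(img_feat[None])[0, 1]
                     for clf in concept_bank.values()])

rng = np.random.default_rng(0)
web_feats = {"cake": rng.normal(size=(40, 128)), "podium": rng.normal(size=(40, 128))}
web_labels = {c: np.array([0, 1] * 20) for c in web_feats}     # stand-in labels
bank = train_concept_bank(web_feats, web_labels)
print(concept_representation(rng.normal(size=128), bank))      # 2-d concept scores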
Aneeq Zia, Daniel Castro, Irfan Essa
Fine-tuning Deep Architectures for Surgical Tool Detection Proceedings Article
In: Workshop and Challenges on Modeling and Monitoring of Computer Assisted Interventions (M2CAI), Held in Conjunction with International Conference on Medical Image Computing and Computer Assisted Intervention (MICCAI), Athens, Greece, 2016.
Abstract | Links | BibTeX | Tags: activity assessment, computer vision, MICCAI, surgical training
@inproceedings{2016-Zia-FDASTD,
title = {Fine-tuning Deep Architectures for Surgical Tool Detection},
author = {Aneeq Zia and Daniel Castro and Irfan Essa},
url = {http://www.cc.gatech.edu/cpl/projects/deepm2cai/
https://www.cc.gatech.edu/cpl/projects/deepm2cai/paper.pdf},
year = {2016},
date = {2016-10-01},
urldate = {2016-10-01},
booktitle = {Workshop and Challenges on Modeling and Monitoring of Computer Assisted Interventions (M2CAI), Held in Conjunction with International Conference on Medical Image Computing and Computer Assisted Intervention (MICCAI)},
address = {Athens, Greece},
abstract = {Understanding surgical workflow has been a key concern of the medical research community. One of the main advantages of surgical workflow detection is real-time operating room (OR) scheduling. For hospitals, each minute of OR time is important in order to reduce cost and increase patient throughput. Traditional approaches in this field generally tackle the video analysis using hand-crafted video features to facilitate the tool detection. Recently, Twinanda et al. presented a CNN architecture 'EndoNet' which outperformed previous methods for both surgical tool detection and surgical phase detection. Given the recent success of these networks, we present a study of various architectures coupled with a submission to the M2CAI Surgical Tool Detection challenge. We achieved a top-3 result for the M2CAI competition with a mAP of 37.6.
},
keywords = {activity assessment, computer vision, MICCAI, surgical training},
pubstate = {published},
tppubtype = {inproceedings}
}
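A generic version of the fine-tuning described in this entry, replacing the classification head of an ImageNet-pretrained backbone with a multi-label surgical-tool head, is sketched below in PyTorch. The backbone choice, class count, and hyperparameters are illustrative assumptions, not the authors' exact configuration.

# Generic fine-tuning sketch (not the authors' setup): swap the classifier head
# of a pretrained backbone for a multi-label tool-presence head.
import torch
import torch.nn as nn
from torchvision import models

NUM_TOOLS = 7                                   # assumed number of tool classes
model = models.resnet50(weights="IMAGENET1K_V1")  # downloads ImageNet weights
model.fc = nn.Linear(model.fc.in_features, NUM_TOOLS)

criterion = nn.BCEWithLogitsLoss()              # several tools can co-occur per frame
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3, momentum=0.9)

frames = torch.randn(8, 3, 224, 224)            # stand-in batch of video frames
labels = torch.randint(0, 2, (8, NUM_TOOLS)).float()
loss = criterion(model(frames), labels)
loss.backward()
optimizer.step()                                 # one fine-tuning step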
Vinay Bettadapura, Caroline Pantofaru, Irfan Essa
Leveraging Contextual Cues for Generating Basketball Highlights Proceedings Article
In: ACM International Conference on Multimedia (ACM-MM), ACM 2016.
Abstract | Links | BibTeX | Tags: ACM, ACMMM, activity recognition, computational video, computer vision, sports visualization, video summarization
@inproceedings{2016-Bettadapura-LCCGBH,
title = {Leveraging Contextual Cues for Generating Basketball Highlights},
author = {Vinay Bettadapura and Caroline Pantofaru and Irfan Essa},
url = {https://dl.acm.org/doi/10.1145/2964284.2964286
http://www.vbettadapura.com/highlights/basketball/index.htm},
doi = {10.1145/2964284.2964286},
year = {2016},
date = {2016-10-01},
urldate = {2016-10-01},
booktitle = {ACM International Conference on Multimedia (ACM-MM)},
organization = {ACM},
abstract = {The massive growth of sports videos has resulted in a need for automatic generation of sports highlights that are comparable in quality to the hand-edited highlights produced by broadcasters such as ESPN. Unlike previous works that mostly use audio-visual cues derived from the video, we propose an approach that additionally leverages contextual cues derived from the environment that the game is being played in. The contextual cues provide information about the excitement levels in the game, which can be ranked and selected to automatically produce high-quality basketball highlights. We introduce a new dataset of 25 NCAA games along with their play-by-play stats and the ground-truth excitement data for each basket. We explore the informativeness of five different cues derived from the video and from the environment through user studies. Our experiments show that for our study participants, the highlights produced by our system are comparable to the ones produced by ESPN for the same games.},
keywords = {ACM, ACMMM, activity recognition, computational video, computer vision, sports visualization, video summarization},
pubstate = {published},
tppubtype = {inproceedings}
}
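The selection step described above, ranking baskets by an excitement score assembled from audio-visual and contextual cues and keeping the top clips, can be illustrated with a toy weighted sum. The cue names and weights below are hypothetical.

# Toy sketch of the ranking step: combine per-basket cue scores into an
# excitement score and keep the top clips.
def rank_highlights(baskets, weights, top_k=10):
    """baskets: list of dicts with cue scores, e.g. crowd_audio, game_closeness."""
    def excitement(b):
        return sum(weights[cue] * b.get(cue, 0.0) for cue in weights)
    return sorted(baskets, key=excitement, reverse=True)[:top_k]

cue_weights = {"crowd_audio": 0.4, "commentator_pitch": 0.3,
               "game_closeness": 0.2, "basket_type": 0.1}
clips = [{"id": 1, "crowd_audio": 0.9, "game_closeness": 0.8},
         {"id": 2, "crowd_audio": 0.3, "game_closeness": 0.1}]
print([c["id"] for c in rank_highlights(clips, cue_weights, top_k=1)])  # [1]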
Daniel Castro, Vinay Bettadapura, Irfan Essa
Discovering Picturesque Highlights from Egocentric Vacation Video Proceedings Article
In: IEEE Winter Conference on Applications of Computer Vision (WACV), 2016.
Abstract | Links | BibTeX | Tags: computational photography, computational video, computer vision, WACV
@inproceedings{2016-Castro-DPHFEVV,
title = {Discovering Picturesque Highlights from Egocentric Vacation Video},
author = {Daniel Castro and Vinay Bettadapura and Irfan Essa},
url = {https://ieeexplore.ieee.org/document/7477707
http://www.cc.gatech.edu/cpl/projects/egocentrichighlights/
https://youtu.be/lIONi21y-mk},
doi = {10.1109/WACV.2016.7477707},
year = {2016},
date = {2016-03-01},
urldate = {2016-03-01},
booktitle = {IEEE Winter Conference on Applications of Computer Vision (WACV)},
abstract = {We present an approach for identifying picturesque highlights from large amounts of egocentric video data. Given a set of egocentric videos captured over the course of a vacation, our method analyzes the videos and looks for images that have good picturesque and artistic properties. We introduce novel techniques to automatically determine aesthetic features such as composition, symmetry and color vibrancy in egocentric videos and rank the video frames based on their photographic qualities to generate highlights. Our approach also uses contextual information such as GPS, when available, to assess the relative importance of each geographic location where the vacation videos were shot. Furthermore, we specifically leverage the properties of egocentric videos to improve our highlight detection. We demonstrate results on a new egocentric vacation dataset which includes 26.5 hours of videos taken over a 14 day vacation that spans many famous tourist destinations and also provide results from a user-study to assess our results.
},
keywords = {computational photography, computational video, computer vision, WACV},
pubstate = {published},
tppubtype = {inproceedings}
}
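One of the aesthetic cues mentioned in the abstract above, color vibrancy, can be approximated by mean HSV saturation weighted by brightness and used to order candidate frames. The sketch below is a simplified stand-in for the paper's composition, symmetry, and vibrancy features.

# Illustrative frame-ranking sketch using a simple color-vibrancy score.
import numpy as np
import colorsys

def vibrancy(frame_rgb):
    """frame_rgb: (H,W,3) float array in [0,1]; returns a scalar vibrancy score."""
    hsv = np.array([colorsys.rgb_to_hsv(*px) for px in frame_rgb.reshape(-1, 3)])
    return float((hsv[:, 1] * hsv[:, 2]).mean())   # saturation weighted by value

def rank_frames(frames, top_k=5):
    return sorted(frames, key=vibrancy, reverse=True)[:top_k]

frames = [np.random.rand(24, 24, 3) for _ in range(8)]   # stand-in video frames
top = rank_frames(frames)
print(len(top), "highlight frames selected")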
Daniel Castro, Steven Hickson, Vinay Bettadapura, Edison Thomaz, Gregory Abowd, Henrik Christensen, Irfan Essa
Predicting Daily Activities from Egocentric Images Using Deep Learning Proceedings Article
In: Proceedings of International Symposium on Wearable Computers (ISWC), 2015.
Abstract | Links | BibTeX | Tags: activity recognition, computer vision, ISWC, machine learning, wearable computing
@inproceedings{2015-Castro-PDAFEIUDL,
title = {Predicting Daily Activities from Egocentric Images Using Deep Learning},
author = {Daniel Castro and Steven Hickson and Vinay Bettadapura and Edison Thomaz and Gregory Abowd and Henrik Christensen and Irfan Essa},
url = {https://dl.acm.org/doi/10.1145/2802083.2808398
https://arxiv.org/abs/1510.01576
http://www.cc.gatech.edu/cpl/projects/dailyactivities/
},
doi = {10.1145/2802083.2808398},
year = {2015},
date = {2015-09-01},
urldate = {2015-09-01},
booktitle = {Proceedings of International Symposium on Wearable Computers (ISWC)},
abstract = {We present a method to analyze images taken from a passive egocentric wearable camera along with contextual information, such as time and day of the week, to learn and predict the everyday activities of an individual. We collected a dataset of 40,103 egocentric images over 6 months with 19 activity classes and demonstrate the benefit of state-of-the-art deep learning techniques for learning and predicting daily activities. Classification is conducted using a Convolutional Neural Network (CNN) with a classification method we introduce called a late fusion ensemble. This late fusion ensemble incorporates relevant contextual information and increases our classification accuracy. Our technique achieves an overall accuracy of 83.07% in predicting a person's activity across the 19 activity classes. We also demonstrate some promising results from two additional users by fine-tuning the classifier with one day of training data.},
keywords = {activity recognition, computer vision, ISWC, machine learning, wearable computing},
pubstate = {published},
tppubtype = {inproceedings}
}
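The late fusion ensemble described above, combining the CNN's class probabilities with a prediction derived from contextual features such as hour and weekday, can be sketched as a weighted average of probability vectors. The fusion weight and example classes below are illustrative, not the paper's trained ensemble.

# Minimal late-fusion sketch: average the CNN's class probabilities with a
# second prediction derived from contextual features (hour, weekday).
import numpy as np

def late_fusion(cnn_probs, context_probs, alpha=0.7):
    """Both inputs: (num_classes,) probability vectors for one image."""
    fused = alpha * cnn_probs + (1.0 - alpha) * context_probs
    return int(np.argmax(fused))

cnn_p = np.array([0.10, 0.70, 0.20])   # e.g. classes: working, eating, commuting
ctx_p = np.array([0.05, 0.15, 0.80])   # context suggests a commute hour
print(late_fusion(cnn_p, ctx_p))        # index of the fused prediction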
Other Publication Sites
A few more sites that aggregate research publications: Academia.edu, Bibsonomy, CiteULike, Mendeley.
Copyright/About
[Please see the Copyright Statement that may apply to the content listed here.]
This list of publications is produced using the teachPress plugin for WordPress.