diff --git a/abstract.tex b/abstract.tex
index efa8e02..63d9fbe 100644
--- a/abstract.tex
+++ b/abstract.tex
@@ -1,18 +1,18 @@
 \begin{abstract}
-Many state of the art energy-minimization approaches to optical flow and scene
-flow estimation rely on a rigid scene model, where the scene is
-represented as an ensemble of distinct, rigidly moving components, a static
-background and a moving camera.
-By constraining the optimization problem with a physically sound scene model,
-these approaches enable state-of-the art motion estimation.
+% Many state of the art energy-minimization approaches to optical flow and scene
+% flow estimation rely on a rigid scene model, where the scene is
+% represented as an ensemble of distinct, rigidly moving components, a static
+% background and a moving camera.
+% By constraining the optimization problem with a physically sound scene model,
+% these approaches enable state-of-the art motion estimation.

 With the advent of deep learning methods, it has become popular to re-purpose
 generic deep networks for classical computer vision problems involving pixel-wise
 estimation.
 Following this trend, many recent end-to-end deep learning approaches to optical
-flow and scene flow directly predict full resolution flow fields with
+flow and scene flow predict full resolution flow fields with
 a generic network for dense, pixel-wise prediction, thereby
 ignoring the inherent structure of the underlying motion estimation problem and any physical
 constraints within the scene.
@@ -20,7 +20,8 @@ constraints within the scene.
 We introduce a scalable end-to-end deep learning approach for dense motion estimation that
 respects the structure of the scene as being composed of distinct objects, thus combining the
 representation learning benefits and speed of end-to-end deep networks
-with a physically plausible scene model.
+with a physically plausible scene model inspired by slanted plane energy-minimization approaches to
+scene flow.

-Building on recent advanced in region-based convolutional networks (R-CNNs),
+Building on recent advances in region-based convolutional networks (R-CNNs),
 we integrate motion estimation with instance segmentation.
@@ -29,7 +30,8 @@ our resulting end-to-end deep network detects objects with accurate per-pixel
 masks and estimates the 3D motion of each detected object between the frames.
 By additionally estimating a global camera motion in the same network, we
 compose a dense optical flow field based on instance-level and global motion
-predictions.
+predictions. Our network is trained on the synthetic Virtual KITTI dataset,
+which provides ground truth for all components of the system.
 \end{abstract}
diff --git a/bib.bib b/bib.bib
index 02c09bf..dbc3fa7 100644
--- a/bib.bib
+++ b/bib.bib
@@ -77,10 +77,10 @@
 booktitle = {{NIPS}},
 year = {2012}}

-@article{VGGNet,
+@inproceedings{VGGNet,
 author = {Karen Simonyan and Andrew Zisserman},
 title = {Very Deep Convolutional Networks for Large-Scale Image Recognition},
-journal = {ICLR},
+booktitle = {ICLR},
 year = {2015}}

 @inproceedings{ResNet,
@@ -89,10 +89,10 @@
 booktitle = {CVPR},
 year = {2016}}

-@article{DenseNetDenseFlow,
+@inproceedings{DenseNetDenseFlow,
 author = {Yi Zhu and Shawn D. Newsam},
 title = {DenseNet for Dense Flow},
-journal = {ICIP},
+booktitle = {ICIP},
 year = {2017}}

 @inproceedings{SE3Nets,
@@ -149,10 +149,10 @@
 booktitle = {{CVPR}},
 year = {2017}}

-@article{SPyNet,
+@inproceedings{SPyNet,
 author = {Anurag Ranjan and Michael J. Black},
 title = {Optical Flow Estimation using a Spatial Pyramid Network},
-journal = {CVPR},
+booktitle = {CVPR},
 year = {2017}}

 @inproceedings{FPN,