diff --git a/.gitignore b/.gitignore index 9682ad1..a61c995 100644 --- a/.gitignore +++ b/.gitignore @@ -6,8 +6,6 @@ thesis.brf thesis.pdf thesis.synctex.gz thesis.log -external.bib -papers.bib short.bib thesis.dvi q.log diff --git a/abstract.tex b/abstract.tex index bcb1cb1..699a12a 100644 --- a/abstract.tex +++ b/abstract.tex @@ -28,8 +28,8 @@ we integrate motion estimation with instance segmentation. Given two consecutive frames from a monocular RGB-D camera, our resulting end-to-end deep network detects objects with precise per-pixel object masks and estimates the 3D motion of each detected object between the frames. -By additionally estimating the camera ego-motion in the same network, -we compose a dense optical flow field based on instance-level and global motion +Additionally, we estimate the camera ego-motion in the same network, +and compose a dense optical flow field based on instance-level and global motion predictions. We train our network on the synthetic Virtual KITTI dataset, which provides ground truth for all components of our system. @@ -62,8 +62,8 @@ Networks (R-CNNs) auf und integrieren Bewegungsschätzung mit Instanzsegmentieru Bei Eingabe von zwei aufeinanderfolgenden Frames aus einer monokularen RGB-D Kamera erkennt unser end-to-end Deep Network Objekte mit pixelgenauen Objektmasken und schätzt die 3D-Bewegung jedes erkannten Objekts zwischen den Frames ab. -Indem wir zusätzlich im selben Netzwerk die Eigenbewerung der Kamera schätzen, -setzen wir aus den instanzbasierten und globalen Bewegungsschätzungen ein dichtes +Zusätzlich schätzen wir im selben Netzwerk die Eigenbewegung der Kamera, +und setzen aus den instanzbasierten und globalen Bewegungsschätzungen ein dichtes optisches Flussfeld zusammen. Wir trainieren unser Netzwerk auf dem synthetischen Virtual KITTI Datensatz, der Ground Truth für alle Komponenten unseres Systems bereitstellt. diff --git a/approach.tex b/approach.tex index 158c0dd..00ee3ae 100644 --- a/approach.tex +++ b/approach.tex @@ -255,7 +255,7 @@ performs better in our case than the standard $\ell_1$-loss. We thus compute the RoI motion loss as \begin{equation} -L_{motion} = \frac{1}{N_{RoI}^{fg}} \sum_k^{N_{RoI}} l_{p}^k + (l_{R}^k + l_{t}^k) \cdot o_k^* + l_o^k, +\text{L}_{motion} = \frac{1}{\text{N}_{RoI}^{\mathit{fg}}} \sum_k^{\text{N}_{RoI}} l_{p}^k + (l_{R}^k + l_{t}^k) \cdot o_k^* + l_o^k, \end{equation} where \begin{equation} @@ -284,7 +284,7 @@ other than $c_k^*$ are not penalized. Now, our modified RoI loss is \begin{equation} -L_{RoI} = L_{cls} + L_{box} + L_{mask} + L_{motion}. +\text{L}_{RoI} = \text{L}_{cls} + \text{L}_{box} + \text{L}_{mask} + \text{L}_{motion}. \end{equation} \paragraph{Camera motion supervision} @@ -339,7 +339,7 @@ full image resolution, as the depth crops and 2D point grid are at the same resolution as the predicted $m \times m$ mask. -For each RoI, we can now compute $L_{RoI}$ and thus supervise the object motion +For each RoI, we can now compute $\text{L}_{RoI}$ and thus supervise the object motion by penalizing the $m \times m$ optical flow grid. If there is optical flow ground truth available, we can use the RoI bounding box to crop and resize a region from the ground truth optical flow to match the RoI's diff --git a/background.tex b/background.tex index 1698d21..64a2b39 100644 --- a/background.tex +++ b/background.tex @@ -105,7 +105,7 @@ image brightness differences penalizes the predictions. 
& Conv-Deconv & H $\times$ W $\times$ 32 \\ masks & 1 $\times$1 conv, N$_{motions}$ & H $\times$ W $\times$ N$_{motions}$ \\ FC & From bottleneck: $\begin{bmatrix}\textrm{fully connected}, 512\end{bmatrix}$ $\times$ 2 & 1 $\times$ 512 \\ -object motions & fully connected, $N_{motions} \cdot$ 9 & H $\times$ W $\times$ $N_{motions} \cdot$ 9 \\ +object motions & fully connected, $\text{N}_{motions} \cdot$ 9 & H $\times$ W $\times$ $\text{N}_{motions} \cdot$ 9 \\ camera motion & From FC: $\times$ 2 & H $\times$ W $\times$ 6 \\ \midrule \multicolumn{3}{c}{\textbf{Structure Network}}\\ @@ -276,10 +276,10 @@ C$_4$ & ResNet \{up to C$_4$\} (Table \ref{table:resnet}) & $\tfrac{1}{16}$ H $ \multicolumn{3}{c}{\textbf{Region Proposal Network (RPN)}}\\ \midrule R$_0$ & From C$_4$: 1 $\times$ 1 conv, 512 & $\tfrac{1}{16}$ H $\times$ $\tfrac{1}{16}$ W $\times$ 512 \\ -& 1 $\times$ 1 conv, 4 & $\tfrac{1}{16}$ H $\times$ $\tfrac{1}{16}$ W $\times$ $N_a \cdot$ 4 \\ +& 1 $\times$ 1 conv, 4 & $\tfrac{1}{16}$ H $\times$ $\tfrac{1}{16}$ W $\times$ $\text{N}_a \cdot$ 4 \\ & flatten & A $\times$ 4 \\ boxes$_{\mathrm{RPN}}$ & decode bounding boxes (Eq. \ref{eq:pred_bounding_box}) & A $\times$ 4\\ -& From R$_0$: 1 $\times$ 1 conv, 2 & $\tfrac{1}{16}$ H $\times$ $\tfrac{1}{16}$ W $\times$ $N_a \cdot$ 2 \\ +& From R$_0$: 1 $\times$ 1 conv, 2 & $\tfrac{1}{16}$ H $\times$ $\tfrac{1}{16}$ W $\times$ $\text{N}_a \cdot$ 2 \\ & flatten & A $\times$ 2 \\ scores$_{\mathrm{RPN}}$& softmax & A $\times$ 2 \\ ROI$_{\mathrm{RPN}}$ & sample boxes$_{\mathrm{RPN}}$ and scores$_{\mathrm{RPN}}$ & N$_{RoI}$ $\times$ 6 \\ @@ -326,9 +326,9 @@ which is a deep feature encoder CNN with the original image as input. Next, the \emph{backbone} output features are passed into a small, fully-convolutional \emph{Region Proposal Network (RPN)} head, which predicts objectness scores and regresses bounding boxes at each of its output positions. At any of the $h \times w$ output positions of the RPN head, -$N_a$ bounding boxes with their objectness scores are predicted as offsets relative to a fixed set of $N_a$ \emph{anchors} with different -aspect ratios and scales. Thus, there are $N_a \times h \times w$ reference anchors in total. -In Faster R-CNN, $N_a = 9$, with 3 scales, corresponding +$\text{N}_a$ bounding boxes with their objectness scores are predicted as offsets relative to a fixed set of $\text{N}_a$ \emph{anchors} with different +aspect ratios and scales. Thus, there are $\text{N}_a \times h \times w$ reference anchors in total. +In Faster R-CNN, $\text{N}_a = 9$, with 3 scales, corresponding to anchor boxes of areas of $\{128^2, 256^2, 512^2\}$ pixels and 3 aspect ratios, $\{1:2, 1:1, 2:1\}$. For the ResNet Faster R-CNN backbone, we generally have a stride of 16 with respect to the input image at the RPN output (Table \ref{table:maskrcnn_resnet}). @@ -388,7 +388,7 @@ P$_6$ & From P$_5$: 2 $\times$ 2 subsample, 256 & $\tfrac{1}{64}$ H $\times$ $\t \midrule \multicolumn{3}{c}{$\forall i \in \{2...6\}$}\\ & From P$_i$: 1 $\times$ 1 conv, 512 & $\tfrac{1}{2^i}$ H $\times$ $\tfrac{1}{2^i}$ W $\times$ 512 \\ -& 1 $\times$ 1 conv, 6 & $\tfrac{1}{2^i}$ H $\times$ $\tfrac{1}{2^i}$ W $\times$ $N_a \cdot$ 6 \\ +& 1 $\times$ 1 conv, 6 & $\tfrac{1}{2^i}$ H $\times$ $\tfrac{1}{2^i}$ W $\times$ $\text{N}_a \cdot$ 6 \\ RPN$_i$& flatten & A$_i$ $\times$ 6 \\ \midrule & From \{RPN$_2$ ... RPN$_6$\}: concatenate & A $\times$ 6 \\ @@ -440,7 +440,7 @@ from the bottleneck. 
Instead of a single RPN head with anchors at 3 scales and 3 aspect ratios,
the FPN variant has one RPN head after each of the pyramid levels P$_2$ ... P$_6$ (see Table \ref{table:maskrcnn_resnet_fpn}).
At each output position of the resulting RPN pyramid, bounding boxes are predicted
-with respect to 3 anchor aspect ratios $\{1:2, 1:1, 2:1\}$ and a single scale ($N_a = 3$).
+with respect to 3 anchor aspect ratios $\{1:2, 1:1, 2:1\}$ and a single scale ($\text{N}_a = 3$).
For P$_2$, P$_3$, P$_4$, P$_5$, P$_6$, the scale corresponds to anchor
bounding boxes of areas $32^2, 64^2, 128^2, 256^2, 512^2$, respectively.
@@ -501,14 +501,21 @@ frequently in the following chapters.
For vector or tuple arguments, the sum of losses is computed.
For classification with mutually exclusive classes, we define the categorical (softmax) cross-entropy loss,
\begin{equation}
-\ell_{cls}(c, c^*) = -\log(),
+\ell_{cls}(c, c^*) = -\log(c_{c^*}),
\end{equation}
-where $c^* \in \{0,N_{cls}\}$ is a label (or vector of labels) and $c \in (0,1)$ is the output of a softmax layer. % TODO label has wrong range
+where $c^* \in \{0, ..., \text{C}-1\}$ is a ground truth label,
+$c$ is the output vector from a softmax layer,
+$c_{c^*} \in (0,1)$ is the output probability for class $c^*$,
+and $\text{C}$ is the number of classes.
+Note that for the object category classifier, $\text{C} = \text{N}_{cls} + 1$,
+as $\text{N}_{cls}$ does not include the background class.
Finally, for multi-label classification, we define the binary (sigmoid) cross-entropy loss,
\begin{equation}
\ell_{cls*}(y, y^*) = -y^* \cdot \log(y) - (1 - y^*) \cdot \log(1 - y),
\end{equation}
-where $y^* \in \{0,1\}$ is a label (or vector of labels) and $y \in (0,1)$ is the output of a sigmoid layer.
+where $y^* \in \{0,1\}$ is a label and $y \in (0,1)$ is the output from a sigmoid layer.
+Note that for the mask loss that will be introduced below, $\ell_{cls*}$ is
+the sum of the $\ell_{cls*}$-losses for all 2D positions in the mask.
\label{ssec:rcnn_techn}
\paragraph{Bounding box regression}
@@ -575,7 +582,7 @@ the predicted relative offsets and scales encoded in $b_e$.
\paragraph{Supervision of the RPN}
A positive RPN proposal is defined as one with a IoU of at least $0.7$ with
-a ground truth bounding box. For training the RPN, $N_{RPN} = 256$ positive and negative
+a ground truth bounding box. For training the RPN, $\text{N}_{RPN} = 256$ positive and negative
examples are randomly sampled from the set of all RPN proposals, with at most
$50\%$ positive examples (if there are less positive examples, more negative
examples are used instead).
@@ -587,18 +594,18 @@ it is negative, let $s_i$ be the predicted objectness score and $b_i$, $b_i^*$ t
predicted and ground truth bounding box encodings.
Then, the RPN loss is computed as \begin{equation} -L_{RPN} = L_{obj} + L_{box}^{RPN}, +\text{L}_{RPN} = \text{L}_{obj} + \text{L}_{box}^{RPN}, \end{equation} where \begin{equation} -L_{obj} = \frac{1}{N_{RPN}} \sum_{i=1}^{N_{RPN}} \ell_{cls}(s_i, s_i^*), +\text{L}_{obj} = \frac{1}{\text{N}_{RPN}} \sum_{i=1}^{\text{N}_{RPN}} \ell_{cls}(s_i, s_i^*), \end{equation} \begin{equation} -L_{box}^{RPN} = \frac{1}{N_{RPN}^{pos}} \sum_{i=1}^{N_{RPN}} s_i^* \cdot \ell_{reg}(b_i^* - b_i), +\text{L}_{box}^{RPN} = \frac{1}{\text{N}_{RPN}^{pos}} \sum_{i=1}^{\text{N}_{RPN}} s_i^* \cdot \ell_{reg}(b_i^* - b_i), \end{equation} and \begin{equation} -N_{RPN}^{pos} = \sum_{i=1}^{N_{RPN}} s_i^* +\text{N}_{RPN}^{pos} = \sum_{i=1}^{\text{N}_{RPN}} s_i^* \end{equation} is the number of positive examples. Note that the bounding box loss is only active for positive examples, and that the classification loss is computed @@ -612,7 +619,7 @@ one with a maximum IoU in $[0.1, 0.5)$. A total of 64 (without FPN) or 512 (with FPN) RoIs are sampled, with at most $25\%$ foreground examples. Now, let $c_i^*$ be the ground truth object class, where $c_i = 0$ -for background examples and $c_i \in \{1, ..., N_{cls}\}$ for foreground examples, +for background examples and $c_i \in \{1, ..., \text{N}_{cls}\}$ for foreground examples, and let $c_i$ be the class prediction. Then, for any foreground RoI, let $b_i^*$ be the ground truth bounding box encoding and $b_i$ the predicted refined box encoding for class $c_i^*$. @@ -623,23 +630,23 @@ In our implementation, we use nearest neighbour resizing for resizing the mask targets. Then, the ROI loss is computed as \begin{equation} -L_{RoI} = L_{cls} + L_{box} + L_{mask} +\text{L}_{RoI} = \text{L}_{cls} + \text{L}_{box} + \text{L}_{mask} \end{equation} where \begin{equation} -L_{cls} = \frac{1}{N_{RoI}} \sum_{i=1}^{N_{RoI}} \ell_{cls}(c_i, c_i^*), +\text{L}_{cls} = \frac{1}{\text{N}_{RoI}} \sum_{i=1}^{\text{N}_{RoI}} \ell_{cls}(c_i, c_i^*), \end{equation} is the average cross-entropy classification loss, \begin{equation} -L_{box} = \frac{1}{N_{RoI}^{fg}} \sum_{i=1}^{N_{RoI}} [c_i^* \geq 1] \cdot \ell_{reg}(b_i^* - b_i) +\text{L}_{box} = \frac{1}{\text{N}_{RoI}^{\mathit{fg}}} \sum_{i=1}^{\text{N}_{RoI}} [c_i^* \geq 1] \cdot \ell_{reg}(b_i^* - b_i) \end{equation} is the average smooth-$\ell_1$ bounding box regression loss, \begin{equation} -L_{mask} = \frac{1}{N_{RoI}^{fg}} \sum_{i=1}^{N_{RoI}} [c_i^* \geq 1] \cdot \ell_{cls*}(m_i,m_i^*) +\text{L}_{mask} = \frac{1}{\text{N}_{RoI}^{\mathit{fg}}} \sum_{i=1}^{\text{N}_{RoI}} [c_i^* \geq 1] \cdot \ell_{cls*}(m_i,m_i^*) \end{equation} is the average binary cross-entropy mask loss, \begin{equation} -N_{RoI}^{fg} = \sum_{i=1}^{N_{RoI}} [c_i^* \geq 1] +\text{N}_{RoI}^{\mathit{fg}} = \sum_{i=1}^{\text{N}_{RoI}} [c_i^* \geq 1] \end{equation} is the number of foreground examples, and \begin{equation} diff --git a/bib.bib b/bib.bib index 7cee841..c4dddd7 100644 --- a/bib.bib +++ b/bib.bib @@ -4,15 +4,19 @@ Vladimir Golkov and Patrick v.d. 
Smagt and Daniel Cremers and Thomas Brox},
title = {{FlowNet}: Learning Optical Flow with Convolutional Networks},
booktitle = iccv-2015,
+ Pages = {2758--2766},
year = iccv-2015-yr,
+ address = iccv-2015-adr,
month = iccv-2015-mon}
@inproceedings{FlowNet2,
author = {Eddy Ilg and Nikolaus Mayer and Tonmoy Saikia and
Margret Keuper and Alexey Dosovitskiy and Thomas Brox},
title = {{FlowNet} 2.0: {E}volution of Optical Flow Estimation with Deep Networks},
+ Pages = {1647--1655},
booktitle = cvpr-2017,
year = cvpr-2017-yr,
+ address = cvpr-2017-adr,
month = cvpr-2017-mon}
@inproceedings{SceneFlowDataset,
@@ -21,7 +25,9 @@
title = {A Large Dataset to Train Convolutional Networks for
Disparity, Optical Flow, and Scene Flow Estimation},
booktitle = cvpr-2016,
+ Pages = {4040--4048},
year = cvpr-2016-yr,
+ address = cvpr-2016-adr,
month = cvpr-2016-mon}
@article{SfmNet,
@@ -37,9 +43,10 @@
@inproceedings{MaskRCNN,
Author = {Kaiming He and Georgia Gkioxari and
Piotr Doll\'{a}r and Ross Girshick},
- Title = {{Mask {R-CNN}}},
+ Title = {Mask {R-CNN}},
Booktitle = cvpr-2017,
Year = cvpr-2017-yr,
+ address = cvpr-2017-adr,
month = cvpr-2017-mon}
@inproceedings{FasterRCNN,
@@ -47,14 +54,20 @@
Ross Girshick and Jian Sun},
Title = {Faster {R-CNN}: Towards Real-Time Object Detection
with Region Proposal Networks},
- Booktitle = nips-2015,
- Year = nips-2015-yr}
+ booktitle = nips-2015,
+ year = nips-2015-yr,
+ Editor = nips-2015-eds,
+ pages = {91--99},
+ Month = dec,
+ Volume = nips-2015-vol}
@inproceedings{FastRCNN,
Author = {Ross Girshick},
Title = {Fast {R-CNN}},
Booktitle = iccv-2015,
+ pages = {1440--1448},
year = iccv-2015-yr,
+ address = iccv-2015-adr,
month = iccv-2015-mon}
@inproceedings{InstanceSceneFlow,
@@ -65,6 +78,7 @@
in Autonomous Driving Scenarios?},
Booktitle = iccv-2017,
year = iccv-2017-yr,
+ address = iccv-2017-adr,
month = iccv-2017-mon}
@inproceedings{RCNN,
@@ -76,12 +90,17 @@
object detection and semantic segmentation},
Booktitle = cvpr-2014,
Year = cvpr-2014-yr,
+ address = cvpr-2014-adr,
+ pages = {580--587},
month = cvpr-2014-mon}
@inproceedings{ImageNetCNN,
title = {ImageNet Classification with Deep Convolutional Neural Networks},
author = {Alex Krizhevsky and Sutskever, Ilya and Hinton, Geoffrey E.},
booktitle = nips-2012,
+ Editor = nips-2012-eds,
+ Volume = nips-2012-vol,
+ pages = {1097--1105},
year = nips-2012-yr}
@inproceedings{VGGNet,
@@ -95,18 +114,24 @@
title = {Deep Residual Learning for Image Recognition},
booktitle = cvpr-2016,
year = cvpr-2016-yr,
+ pages = {770--778},
+ address = cvpr-2016-adr,
month = cvpr-2016-mon}
@inproceedings{DenseNetDenseFlow,
author = {Yi Zhu and Shawn D.
Newsam}, title = {DenseNet for Dense Flow}, booktitle = icip-2017, + month = icip-2017-mon, + address = icip-2017-adr, year = icip-2017-yr} @inproceedings{SE3Nets, author = {Arunkumar Byravan and Dieter Fox}, title = {{SE3-Nets}: Learning Rigid Body Motion using Deep Neural Networks}, booktitle = {Proceedings of the IEEE International Conference on Robotics and Automation}, + pages = {173--180}, + address = "Singapore, Singapore", year = {2017}} @inproceedings{FlowLayers, @@ -114,19 +139,29 @@ title = {Optical Flow with Semantic Segmentation and Localized Layers}, booktitle = cvpr-2016, year = cvpr-2016-yr, + pages = {3889--3898}, + address = cvpr-2016-adr, month = cvpr-2016-mon} @inproceedings{ESI, author = {Min Bai and Wenjie Luo and Kaustav Kundu and Raquel Urtasun}, title = {Exploiting Semantic Information and Deep Matching for Optical Flow}, booktitle = eccv-2016, - year = eccv-2016-yr} + year = eccv-2016-yr, + Series = eccv-2016-ser, + Editor = eccv-2016-eds, + Pages = {154--170}, + Publisher = eccv-2016-pub, + Volume = eccv-2016-vol6, + Sortmonth = eccv-2016-srtmon} @inproceedings{VKITTI, author = {Adrien Gaidon and Qiao Wang and Yohann Cabon and Eleonora Vig}, title = {Virtual Worlds as Proxy for Multi-Object Tracking Analysis}, booktitle = cvpr-2016, year = cvpr-2016-yr, + address = cvpr-2016-adr, + pages = {4340--4349}, month = cvpr-2016-mon} @inproceedings{KITTI2012, @@ -134,26 +169,36 @@ title = {Are we ready for Autonomous Driving? The {KITTI} Vision Benchmark Suite}, booktitle = cvpr-2012, year = cvpr-2012-yr, + pages = {3354--3361}, + address = cvpr-2012-adr, month = cvpr-2012-mon} @inproceedings{KITTI2015, author = {Moritz Menze and Andreas Geiger}, title = {Object Scene Flow for Autonomous Vehicles}, booktitle = cvpr-2015, + pages = {3061--3070}, year = cvpr-2015-yr, + address = cvpr-2015-adr, month = cvpr-2015-mon} @inproceedings{PRSF, author = {C. Vogel and K. Schindler and S. Roth}, title = {Piecewise Rigid Scene Flow}, booktitle = iccv-2013, + address = iccv-2013-adr, year = iccv-2013-yr, + Pages = {1377--1384}, month = iccv-2013-mon} @article{PRSM, author = {C. Vogel and K. Schindler and S. 
Roth}, - title = {3D Scene Flow with a Piecewise Rigid Scene Model}, - booktitle = ijcv, + title = {{3D} Scene Flow with a Piecewise Rigid Scene Model}, + journal = ijcv, + Month = oct, + Number = {1}, + Pages = {1--28}, + Volume = {115}, year = {2015}} @inproceedings{MRFlow, @@ -161,6 +206,8 @@ title = {Optical Flow in Mostly Rigid Scenes}, booktitle = cvpr-2017, year = cvpr-2017-yr, + address = cvpr-2017-adr, + Pages = {6911--6920}, month = cvpr-2017-mon} @inproceedings{SPyNet, @@ -168,6 +215,8 @@ title = {Optical Flow Estimation using a Spatial Pyramid Network}, booktitle = cvpr-2017, year = cvpr-2017-yr, + address = cvpr-2017-adr, + Pages = {2720--2729}, month = cvpr-2017-mon} @inproceedings{FPN, @@ -175,6 +224,8 @@ title = {Feature Pyramid Networks for Object Detection}, booktitle = cvpr-2017, year = cvpr-2017-yr, + Pages = {936--944}, + address = cvpr-2017-adr, month = cvpr-2017-mon} @inproceedings{PoseNet, @@ -182,6 +233,8 @@ title = {PoseNet: A Convolutional Network for Real-Time 6-DOF Camera Relocalization}, booktitle = iccv-2015, year = iccv-2015-yr, + Pages = {2938--2946}, + address = iccv-2015-adr, month = iccv-2015-mon} @inproceedings{PoseNet2, @@ -189,24 +242,36 @@ title = {Geometric loss functions for camera pose regression with deep learning}, booktitle = cvpr-2017, year = cvpr-2017-yr, + address = cvpr-2017-adr, + Pages = {6555--6564}, month = cvpr-2017-mon} @inproceedings{STN, author = {M. Jadeberg and K. Zisserman and K. Kavukcuoglu}, title = {Spatial transformer networks}, booktitle = nips-2015, - year = nips-2015-yr} + year = nips-2015-yr, + Editor = nips-2015-eds, + Pages = {2017--2025}, + Volume = nips-2015-vol} @inproceedings{CensusTerm, author = {Fridtjof Stein}, title = {Efficient Computation of Optical Flow Using the Census Transform}, booktitle = dagm-2004, - year = dagm-2004-yr} + pages={79--86}, + year = dagm-2004-yr, + volume = dagm-2004-vol, + series = dagm-2004-ser, + publisher = dagm-2004-pub, + editor = dagm-2004-eds + } @inproceedings{DeeperDepth, author = {Iro Laina and Christian Rupprecht and Vasileios Belagiannis and Federico Tombari and Nassir Navab}, title = {Deeper Depth Prediction with Fully Convolutional Residual Networks}, booktitle = {Proceedings of the International Conference on 3D Vision}, + pages={239--248}, year = {2016}} @inproceedings{TensorFlowObjectDetection, @@ -215,6 +280,8 @@ title = {Speed/accuracy trade-offs for modern convolutional object detectors}, booktitle = cvpr-2017, year = cvpr-2017-yr, + address = cvpr-2017-adr, + Pages = {3296--3297}, month = cvpr-2017-mon} @misc{TensorFlow, @@ -224,45 +291,64 @@ author={Martín Abadi and others}, year={2015}} -@inproceedings{LSTM, +@article{LSTM, author = {Sepp Hochreiter and Jürgen Schmidhuber}, title = {Long Short-Term Memory}, - booktitle = neco, + journal = neco, + volume = {9}, + number = {8}, + month = nov, + pages={1735--1780}, year = {1997}} @inproceedings{TemporalSF, author = {Christoph Vogel and Stefan Roth and Konrad Schindler}, - title = {View-Consistent 3D Scene Flow Estimation over Multiple Frames}, + title = {View-Consistent {3D} Scene Flow Estimation over Multiple Frames}, booktitle = eccv-2014, + Series = eccv-2014-ser, + Editor = eccv-2014-eds, + Pages = {263--278}, + Publisher = eccv-2014-pub, + Volume = eccv-2014-vol4, + Sortmonth = eccv-2014-srtmon, year = eccv-2014-yr} @inproceedings{Cityscapes, author = {M. Cordts and M. Omran and S. Ramos and T. Rehfeld and M. Enzweiler and R. Benenson and U. Franke and S. Roth and B. 
Schiele}, - title = {The Cityscapes Dataset for Semantic Urban Scene Understanding}, + title = {The {C}ityscapes Dataset for Semantic Urban Scene Understanding}, booktitle = cvpr-2016, year = cvpr-2016-yr, + Pages = {3213--3223}, + address = cvpr-2016-adr, month = cvpr-2016-mon} -@inproceedings{SGD, +@article{SGD, author = {Y. LeCun and B. Boser and J. S. Denker and D. Henderson and R. E. Howard and W. Hubbard and L. D. Jackel}, title = {Backpropagation applied to handwritten zip code recognition}, - booktitle = neco, + volume={1}, + number={4}, + pages={541-551}, + journal = neco, year = {1989}} @inproceedings{GCNet, author = {Alex Kendall and Hayk Martirosyan and Saumitro Dasgupta and Peter Henry Ryan Kennedy and Abraham Bachrach and Adam Bry}, title = {End-to-End Learning of Geometry and Context for Deep Stereo Regression}, - booktitle = cvpr-2017, - year = cvpr-2017-yr, - month = cvpr-2017-mon} + booktitle = iccv-2017, + year = iccv-2017-yr, + address = iccv-2017-adr, + month = iccv-2017-mon} @inproceedings{BN, author = {Sergey Ioffe and Christian Szegedy}, title = {Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift}, booktitle = icml-2015, + address = icml-2015-adr, + month = icml-2015-mon, + Pages = {448--456}, year = icml-2015-yr} @inproceedings{He, @@ -270,35 +356,58 @@ title = {Delving Deep into Rectifiers: Surpassing Human-Level Performance on ImageNet Classification}, booktitle = iccv-2015, year = iccv-2015-yr, + Pages = {1026--1034}, + address = iccv-2015-adr, month = iccv-2015-mon} @inproceedings{UnFlow, author = {Simon Meister and Junhwa Hur and Stefan Roth}, title = {UnFlow: Unsupervised Learning of Optical Flow with a Bidirectional Census Loss}, booktitle = aaai-2018, + address = aaai-2018-adr, + month = aaai-2018-mon, + Note = {to appear}, year = aaai-2018-yr} @inproceedings{UnsupDepth, title={Unsupervised Learning of Depth and Ego-Motion from Video}, author={Ravi Garg and BG Vijay Kumar and Gustavo Carneiro and Ian Reid}, - booktitle=eccv-2016, - year=eccv-2016-yr} + pages={6612-6619}, + booktitle = cvpr-2017, + year = cvpr-2017-yr, + address = cvpr-2017-adr, + month = cvpr-2017-mon} @inproceedings{UnsupPoseDepth, title={Unsupervised CNN for single view depth estimation: Geometry to the rescue}, author={Tinghui Zhou and Matthew Brown and Noah Snavely and David G. Lowe}, - booktitle=cvpr-2017, - year = cvpr-2017-yr, - month = cvpr-2017-mon} + booktitle=eccv-2016, + year=eccv-2016-yr, + Series = eccv-2016-ser, + Editor = eccv-2016-eds, + Pages = {740--756}, + Publisher = eccv-2016-pub, + Volume = eccv-2016-vol8, + Sortmonth = eccv-2016-srtmon} @inproceedings{UnsupFlownet, title={Back to Basics: Unsupervised Learning of Optical Flow via Brightness Constancy and Motion Smoothness}, author={Jason J. Yu and Adam W. Harley and Konstantinos G. 
Derpanis}, booktitle={ECCV 2016 Workshops}, + Pages = {3--10}, + Publisher = eccv-2016-pub, + Series = eccv-2016-ser, + Sortmonth = eccv-2016-srtmon, + Volume = eccv-2016-vol3, year={2016}} @article{ImageNet, title={ImageNet Large Scale Visual Recognition Challenge}, author={Olga Russakovsky and others}, - booktitle=ijcv, + month={Dec}, + day={01}, + volume={115}, + number={3}, + pages={211--252}, + journal=ijcv, year={2015}} diff --git a/conclusion.tex b/conclusion.tex index 34f98f9..8a73061 100644 --- a/conclusion.tex +++ b/conclusion.tex @@ -19,6 +19,7 @@ and achieves high accuracy in classifying between moving and non-moving objects, the accuracy of the motion predictions is still not convincing. More work will be thus required to bring the system (closer) to competetive accuracy, which includes trying penalization with the flow loss instead of 3D motion ground truth, +experimenting with the weighting between different loss terms, and improvements to the network architecture and training process. We thus presented a partial step towards real time 3D motion estimation based on a physically sound scene decomposition. Thanks to instance-level reasoning, in contrast diff --git a/experiments.tex b/experiments.tex index 8e97679..bfa1c5e 100644 --- a/experiments.tex +++ b/experiments.tex @@ -162,6 +162,8 @@ first 144K iterations and $0.25 \cdot 10^{-3}$ for all remaining iterations. For training the RPN and RoI heads and during inference, we use the exact same number of proposals and RoIs as Mask R-CNN in the ResNet and ResNet-FPN variants, respectively. +All losses are added up without additional weighting between the loss terms, +as in Mask R-CNN. \paragraph{Initialization} For initializing the C$_1$ to C$_5$ weights, we use a pre-trained @@ -226,7 +228,7 @@ Evaluation of different metrics on the Virtual KITTI validation set. AEE: Average Endpoint Error; Fl-all: Ratio of pixels where flow estimate is wrong by both $\geq 3$ pixels and $\geq 5\%$. We compare network variants with and without FPN. -Camera and instance motion errors are averaged over the validation set. +All metrics are averaged over all examples in the validation set. Quantities in parentheses in the first row are the average ground truth values for the estimated quantity. For example, we compare the error in camera angle, $E_{R}^{cam} [deg]$, to the average rotation angle in the ground truth camera motions. @@ -250,14 +252,17 @@ on the Virtual KITTI validation set. \paragraph{Camera motion} Both variants achieve a low error in predicted camera translation, relative to the average ground truth camera translation. The camera rotation angle error -is relatively high compared to the small average ground truth camera rotation. +is still relatively high, compared to the small average ground truth camera rotation. Although both variants use the exact same network for predicting the camera motion, the FPN variant performs worse here, with the error in rotation angle twice as high. One possible explanations that should be investigated in futher work is that in the FPN variant, all blocks in the backbone are shared between the camera motion branch and the feature pyramid. In the variant without FPN, the C$5$ and C$6$ blocks are only used in the camera branch, and thus only experience weight -updates due to the camera motion loss. +updates due to the camera motion loss. This could mean that the weight updates due +to the RoI head losses are detrimental to the camera motion estimation. 
+As a remedy, increasing the loss weighting of the camera motion loss may be +helpful. \paragraph{Instance motion} The object pivots are estimated with relatively (given that the scenes are in a realistic scale) @@ -266,13 +271,16 @@ precise, which we ascribe to the higher resolution features used in this variant The predicted 3D object translations and rotations still have a relatively high error, compared to the average actual (ground truth) translations and rotations, -which may be due to implementation issues or problems with the current 3D motion -ground truth loss. -The FPN variant is only slightly more accurate for these predictions, which suggests -that there may still be issues with our implementation, as one would expect the -FPN to be more accurate. +which may be due to implementation issues, a non-optimal network architecture, +or problems with the current 3D motion ground truth loss +(e.g., non-optimal weighting between the components of the motion loss, or between motion and classification losses). +Note that the relative error is higher for rotations, which is +also the case in the camera motion estimates. +The FPN variant is only slightly more accurate for these predictions, which again suggests +that there may still be issues with our loss design, loss weighting, or implementation, as one would expect the +FPN to yield more accurate motion estimates (as is the case for the pivot estimation). \paragraph{Instance segmentation} Looking at Figure \ref{figure:vkitti}, our instance segmentation results are in -many cases still lacking the accuracy seen in the Mask R-CNN Cityscapes \cite{MaskRCNN} results, +some cases still lacking the accuracy seen in the Mask R-CNN Cityscapes \cite{MaskRCNN} results, which is likely due to implementation details. diff --git a/introduction.tex b/introduction.tex index c4b9936..cc95d37 100644 --- a/introduction.tex +++ b/introduction.tex @@ -124,7 +124,7 @@ In the following, we will refer to systems which use deep networks for all optimization and do not perform time-critical side computation (e.g. numerical optimization) at inference time as \emph{end-to-end} deep learning systems. -\paragraph{Deep networks in optical flow} +\paragraph{Deep networks in optical flow estimation} End-to-end deep networks for optical flow were recently introduced based on encoder-decoder networks or CNN pyramids \cite{FlowNet, FlowNet2, SPyNet}, diff --git a/long_extended.bib b/long_extended.bib index c6b4e27..275ff1f 100644 --- a/long_extended.bib +++ b/long_extended.bib @@ -1253,7 +1253,8 @@ @String{aaai-2018 = "Proceedings of the Thirty-Second AAAI Conference on Artificial Intelligence"} @String{aaai-2018-yr = "2018"} -@String{aaai-2018-adr = "Menlo Park, California"} +@String{aaai-2018-mon = feb} +@String{aaai-2018-adr = "New Orleans, Louisiana"} @String{aaai-2018-pub = "AAAI Press"} @String{aaai-2005 = "Proceedings of the Twentieth International Conference on Machine Learning"} diff --git a/old.bib b/old.bib deleted file mode 100644 index 9fa04b6..0000000 --- a/old.bib +++ /dev/null @@ -1,281 +0,0 @@ -@inproceedings{FlowNet, - author = {Alexey Dosovitskiy and Philipp Fischer and Eddy Ilg - and Philip H{\"a}usser and Caner Haz{\i}rba{\c{s}} and - Vladimir Golkov and Patrick v.d. 
Smagt and Daniel Cremers and Thomas Brox}, - title = {{FlowNet}: Learning Optical Flow with Convolutional Networks}, - booktitle = {{ICCV}}, - year = {2015}} - -@inproceedings{FlowNet2, - author = {Eddy Ilg and Nikolaus Mayer and Tonmoy Saikia and - Margret Keuper and Alexey Dosovitskiy and Thomas Brox}, - title = {{FlowNet} 2.0: {E}volution of Optical Flow Estimation with Deep Networks}, - booktitle = {{CVPR}}, - year = {2017},} - -@inproceedings{SceneFlowDataset, - author = {Nikolaus Mayer and Eddy Ilg and Philip H{\"a}usser and Philipp Fischer and - Daniel Cremers and Alexey Dosovitskiy and Thomas Brox}, - title = {A Large Dataset to Train Convolutional Networks for - Disparity, Optical Flow, and Scene Flow Estimation}, - booktitle = {{CVPR}}, - year = {2016}} - -@article{SfmNet, - author = {Sudheendra Vijayanarasimhan and - Susanna Ricco and - Cordelia Schmid and - Rahul Sukthankar and - Katerina Fragkiadaki}, - title = {{SfM-Net}: Learning of Structure and Motion from Video}, - journal = {arXiv preprint arXiv:1704.07804}, - year = {2017}} - -@inproceedings{MaskRCNN, - Author = {Kaiming He and Georgia Gkioxari and - Piotr Doll\'{a}r and Ross Girshick}, - Title = {{Mask {R-CNN}}}, - Booktitle = {CVPR}, - Year = {2017}} - -@inproceedings{FasterRCNN, - Author = {Shaoqing Ren and Kaiming He and - Ross Girshick and Jian Sun}, - Title = {Faster {R-CNN}: Towards Real-Time Object Detection - with Region Proposal Networks}, - Booktitle = {{NIPS}}, - Year = {2015}} - -@inproceedings{FastRCNN, - Author = {Ross Girshick}, - Title = {Fast {R-CNN}}, - Booktitle = {{ICCV}}, - Year = {2015}} - -@inproceedings{InstanceSceneFlow, - Author = {Aseem Behl and Omid Hosseini Jafari and Siva Karthik Mustikovela and - Hassan Abu Alhaija and Carsten Rother and Andreas Geiger}, - Title = {Bounding Boxes, Segmentations and Object Coordinates: - How Important is Recognition for 3D Scene Flow Estimation - in Autonomous Driving Scenarios?}, - Booktitle = {{ICCV}}, - Year = {2017}} - -@inproceedings{RCNN, - Author = {Ross Girshick and - Jeff Donahue and - Trevor Darrell and - Jitendra Malik}, - Title = {Rich feature hierarchies for accurate - object detection and semantic segmentation}, - Booktitle = {{CVPR}}, - Year = {2014}} - -@inproceedings{ImageNetCNN, - title = {ImageNet Classification with Deep Convolutional Neural Networks}, - author = {Alex Krizhevsky and Sutskever, Ilya and Hinton, Geoffrey E.}, - booktitle = {{NIPS}}, - year = {2012}} - -@inproceedings{VGGNet, - author = {Karen Simonyan and Andrew Zisserman}, - title = {Very Deep Convolutional Networks for Large-Scale Image Recognition}, - booktitle = {ICLR}, - year = {2015}} - -@inproceedings{ResNet, - author = {Kaiming He and Xiangyu Zhang and Shaoqing Ren and Jian Sun}, - title = {Deep Residual Learning for Image Recognition}, - booktitle = {CVPR}, - year = {2016}} - -@inproceedings{DenseNetDenseFlow, - author = {Yi Zhu and Shawn D. Newsam}, - title = {DenseNet for Dense Flow}, - booktitle = {ICIP}, - year = {2017}} - -@inproceedings{SE3Nets, - author = {Arunkumar Byravan and Dieter Fox}, - title = {{SE3-Nets}: Learning Rigid Body Motion using Deep Neural Networks}, - booktitle = {{ICRA}}, - year = {2017}} - -@inproceedings{FlowLayers, - author = {Laura Sevilla-Lara and Deqing Sun and Varun Jampani and Michael J. 
Black}, - title = {Optical Flow with Semantic Segmentation and Localized Layers}, - booktitle = {{CVPR}}, - year = {2016}} - -@inproceedings{ESI, - author = {Min Bai and Wenjie Luo and Kaustav Kundu and Raquel Urtasun}, - title = {Exploiting Semantic Information and Deep Matching for Optical Flow}, - booktitle = {{ECCV}}, - year = {2016}} - -@inproceedings{VKITTI, - author = {Adrien Gaidon and Qiao Wang and Yohann Cabon and Eleonora Vig}, - title = {Virtual Worlds as Proxy for Multi-Object Tracking Analysis}, - booktitle = {{CVPR}}, - year = {2016}} - -@inproceedings{KITTI2012, - author = {Andreas Geiger and Philip Lenz and Raquel Urtasun}, - title = {Are we ready for Autonomous Driving? The {KITTI} Vision Benchmark Suite}, - booktitle = {{CVPR}}, - year = {2012}} - -@inproceedings{KITTI2015, - author = {Moritz Menze and Andreas Geiger}, - title = {Object Scene Flow for Autonomous Vehicles}, - booktitle = {{CVPR}}, - year = {2015}} - -@inproceedings{PRSF, - author = {C. Vogel and K. Schindler and S. Roth}, - title = {Piecewise Rigid Scene Flow}, - booktitle = {{ICCV}}, - year = {2013}} - -@inproceedings{PRSM, - author = {C. Vogel and K. Schindler and S. Roth}, - title = {3D Scene Flow with a Piecewise Rigid Scene Model}, - booktitle = {{IJCV}}, - year = {2015}} - -@inproceedings{MRFlow, - author = {Jonas Wulff and Laura Sevilla-Lara and Michael J. Black}, - title = {Optical Flow in Mostly Rigid Scenes}, - booktitle = {{CVPR}}, - year = {2017}} - -@inproceedings{SPyNet, - author = {Anurag Ranjan and Michael J. Black}, - title = {Optical Flow Estimation using a Spatial Pyramid Network}, - booktitle = {CVPR}, - year = {2017}} - -@inproceedings{FPN, - author = {Tsung-Yi Lin and Piotr Dollár and Ross Girshick and Kaiming He and Bharath Hariharan and Serge Belongie}, - title = {Feature Pyramid Networks for Object Detection}, - booktitle = {CVPR}, - year = {2017}} - -@inproceedings{PoseNet, - author = {Alex Kendall and Matthew Grimes and Roberto Cipolla}, - title = {PoseNet: A Convolutional Network for Real-Time 6-DOF Camera Relocalization}, - booktitle = {ICCV}, - year = {2015}} - -@inproceedings{PoseNet2, - author = {Alex Kendall and Roberto Cipolla}, - title = {Geometric loss functions for camera pose regression with deep learning}, - booktitle = {CVPR}, - year = {2017}} - -@inproceedings{STN, - author = {M. Jadeberg and K. Zisserman and K. Kavukcuoglu}, - title = {Spatial transformer networks}, - booktitle = {NIPS}, - year = {2015}} - -@inproceedings{CensusTerm, - author = {Fridtjof Stein}, - title = {Efficient Computation of Optical Flow Using the Census Transform}, - booktitle = {{DAGM} Symposium}, - year = {2004}} - -@inproceedings{DeeperDepth, - author = {Iro Laina and Christian Rupprecht and Vasileios Belagiannis and Federico Tombari and Nassir Navab}, - title = {Deeper Depth Prediction with Fully Convolutional Residual Networks}, - booktitle = {3DV}, - year = {2016}} - -@inproceedings{TensorFlowObjectDetection, - author = {J. Huang and V. Rathod and C. Sun and M. Zhu and A. Korattikara and A. Fathi and I. Fischer and Z. Wojna, - and Y. Song and S. Guadarrama and K. 
Murphy}, - title = {Speed/accuracy trade-offs for modern convolutional object detectors}, - booktitle = {CVPR}, - year = {2017}} - -@misc{TensorFlow, - title={{TensorFlow}: Large-Scale Machine Learning on Heterogeneous Systems}, - url={http://tensorflow.org/}, - note={Software available from tensorflow.org}, - author={Martín Abadi and others}, - year={2015}} - -@inproceedings{LSTM, - author = {Sepp Hochreiter and Jürgen Schmidhuber}, - title = {Long Short-Term Memory}, - booktitle = {Neural Computation}, - year = {1997}} - -@inproceedings{TemporalSF, - author = {Christoph Vogel and Stefan Roth and Konrad Schindler}, - title = {View-Consistent 3D Scene Flow Estimation over Multiple Frames}, - booktitle = {ECCV}, - year = {2014}} - -@inproceedings{Cityscapes, - author = {M. Cordts and M. Omran and S. Ramos and T. Rehfeld and - M. Enzweiler and R. Benenson and U. Franke and S. Roth and B. Schiele}, - title = {The Cityscapes Dataset for Semantic Urban Scene Understanding}, - booktitle = {CVPR}, - year = {2016}} - -@inproceedings{SGD, - author = {Y. LeCun and B. Boser and J. S. Denker and D. Henderson - and R. E. Howard and W. Hubbard and L. D. Jackel}, - title = {Backpropagation applied to handwritten zip code recognition}, - booktitle = {Neural Computation}, - year = {1989}} - -@inproceedings{GCNet, - author = {Alex Kendall and Hayk Martirosyan and Saumitro Dasgupta and Peter Henry - Ryan Kennedy and Abraham Bachrach and Adam Bry}, - title = {End-to-End Learning of Geometry and Context for Deep Stereo Regression}, - booktitle = {CVPR}, - year = {2017}} - -@inproceedings{BN, - author = {Sergey Ioffe and Christian Szegedy}, - title = {Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift}, - booktitle = {ICML}, - year = {2015}} - -@inproceedings{He, - author = {Kaiming He and Xiangyu Zhang and Shaoqing Ren and Jian Sun}, - title = {Delving Deep into Rectifiers: Surpassing Human-Level Performance on ImageNet Classification}, - booktitle = {ICCV}, - year = {2015}} - -@inproceedings{UnFlow, - author = {Simon Meister and Junhwa Hur and Stefan Roth}, - title = {UnFlow: Unsupervised Learning of Optical Flow with a Bidirectional Census Loss}, - booktitle = {AAAI}, - year = {2018}} - -@inproceedings{UnsupDepth, - title={Unsupervised Learning of Depth and Ego-Motion from Video}, - author={Ravi Garg and BG Vijay Kumar and Gustavo Carneiro and Ian Reid}, - booktitle={ECCV}, - year={2016}} - -@inproceedings{UnsupPoseDepth, - title={Unsupervised CNN for single view depth estimation: Geometry to the rescue}, - author={Tinghui Zhou and Matthew Brown and Noah Snavely and David G. Lowe}, - booktitle={CVPR}, - year={2017}} - -@inproceedings{UnsupFlownet, - title={Back to Basics: Unsupervised Learning of Optical Flow via Brightness Constancy and Motion Smoothness}, - author={Jason J. Yu and Adam W. Harley and Konstantinos G. 
Derpanis}, - booktitle={ECCV Workshops}, - year={2016}} - - @inproceedings{ImageNet, - title={ImageNet Large Scale Visual Recognition Challenge}, - author={Olga Russakovsky and others}, - booktitle={IJCV}, - year={2015}} diff --git a/thesis.tex b/thesis.tex index 3c12476..8d5eb06 100644 --- a/thesis.tex +++ b/thesis.tex @@ -65,6 +65,8 @@ \DeclareFieldFormat*{title}{\mkbibemph{#1\isdot}} % zitierte Titel kursiv formatieren \addbibresource{long_extended.bib} % Hier Pfad zu deiner .bib-Datei hineinschreiben +%\addbibresource{external.bib} % Hier Pfad zu deiner .bib-Datei hineinschreiben +%\addbibresource{papers.bib} % Hier Pfad zu deiner .bib-Datei hineinschreiben \addbibresource{bib.bib} % Hier Pfad zu deiner .bib-Datei hineinschreiben \nocite{*} % Alle Einträge in der .bib-Datei im Literaturverzeichnis ausgeben, auch wenn sie nicht im Text zitiert werden. Gut zum Testen der .bib-Datei, sollte aber nicht generell verwendet werden. Stattdessen lieber gezielt Einträge mit Keywords ausgeben lassen (siehe \printbibliography in Zeile 224). @@ -94,19 +96,19 @@ \author{\myname} \thesistitle{\mytitleen}{\mytitlede} \birthplace{Erbach} -\date{21.11.2017} +\date{22.11.2017} \referee{\myprof}{M.Sc. Junhwa Hur} \department{\myinstitute} \group{\myfaculty} -\dateofexam{23.11.2017}{23.11.2017} +\dateofexam{23.11.2017}{22.11.2017} \makethesistitle % Eigenständigkeitserklärung: muss nach \makethesistitle erscheinen, sonst wird sie als erste Seite des Dokuments gesetzt. -\affidavit[23.11.2017]{\myname} +\affidavit[22.11.2017]{\myname} %\affidavit{\myname} \pagestyle{myheadings} % Seitenstil umschalten -\mymarkright{Version: \today} % Inhalt der Fußzeile +%\mymarkright{Version: \today} % Inhalt der Fußzeile \input{abstract}