diff --git a/approach.tex b/approach.tex index 56a8b53..0a43dcd 100644 --- a/approach.tex +++ b/approach.tex @@ -1,6 +1,6 @@ -\subsection{Motion R-CNN architecture} -\label{ssec:architecture} +\subsection{Motion R-CNN} +\label{ssec:model} Building on Mask R-CNN \cite{MaskRCNN}, we estimate per-object motion by predicting the 3D motion of each detected object. @@ -26,25 +26,25 @@ C$_4$ & ResNet-50 \{up to C$_4$\} (Table \ref{table:resnet}) & $\tfrac{1}{16}$ H \multicolumn{3}{c}{\textbf{Camera Motion Network}}\\ \midrule & From C$_4$: ResNet-50 \{C$_5$\} (Table \ref{table:resnet}) & $\tfrac{1}{32}$ H $\times$ $\tfrac{1}{32}$ W $\times$ 2048 \\ -& 1 $\times$ 1 conv, 2048 & $\tfrac{1}{32}$ H $\times$ $\tfrac{1}{32}$ W $\times$ 2048 \\ -& 3 $\times$ 3 conv, 2048, stride 2 & $\tfrac{1}{64}$ H $\times$ $\tfrac{1}{64}$ W $\times$ 2048 \\ -& average pool & 1 $\times$ 2048 \\ -M$_1$ & $\begin{bmatrix}\textrm{fully connected}, 1024\end{bmatrix}$ $\times$ 2 & 1 $\times$ 1024 \\ +& 1 $\times$ 1 conv, 512 & $\tfrac{1}{32}$ H $\times$ $\tfrac{1}{32}$ W $\times$ 512 \\ +& bilinear resize, 7 $\times$ 7 & 7 $\times$ 7 $\times$ 512 \\ +& flatten & 1 $\times$ 7 $\cdot$ 7 $\cdot$ 512 \\ +T$_0$ & $\begin{bmatrix}\textrm{fully connected}, 1024\end{bmatrix}$ $\times$ 2 & 1 $\times$ 1024 \\ -$R_t^{cam}$& From M$_1$: fully connected, 3 & 1 $\times$ 3 \\ -$t_t^{cam}$& From M$_1$: fully connected, 3 & 1 $\times$ 3 \\ -& From M$_1$: fully connected, 2 & 1 $\times$ 2 \\ +$R_t^{cam}$& From T$_0$: fully connected, 3 & 1 $\times$ 3 \\ +$t_t^{cam}$& From T$_0$: fully connected, 3 & 1 $\times$ 3 \\ +& From T$_0$: fully connected, 2 & 1 $\times$ 2 \\ $o_t^{cam}$& softmax, 2 & 1 $\times$ 2 \\ \midrule \multicolumn{3}{c}{\textbf{RoI Head \& RoI Head: Masks} (Table \ref{table:maskrcnn_resnet})}\\ \midrule \multicolumn{3}{c}{\textbf{RoI Head: Motions}}\\ \midrule -M$_2$ & From ave: $\begin{bmatrix}\textrm{fully connected}, 1024\end{bmatrix}$ $\times$ 2 & N$_{RPN}$ $\times$ 1024 \\ -$\forall k: R_t^k$ & From M$_2$: fully connected, 3 & N$_{RPN}$ $\times$ 3 \\ -$\forall k: t_t^k$ & From M$_2$: fully connected, 3 & N$_{RPN}$ $\times$ 3 \\ -$\forall k: p_t^k$ & From M$_2$: fully connected, 3 & N$_{RPN}$ $\times$ 3 \\ -& From M$_2$: fully connected, 2 & N$_{RPN}$ $\times$ 2 \\ +& From M$_0$: flatten & N$_{RPN}$ $\times$ 7 $\cdot$ 7 $\cdot$ 256 \\ +T$_1$ & $\begin{bmatrix}\textrm{fully connected}, 1024\end{bmatrix}$ $\times$ 2 & N$_{RPN}$ $\times$ 1024 \\ +$\forall k: R_t^k$ & From T$_1$: fully connected, 3 & N$_{RPN}$ $\times$ 3 \\ +$\forall k: t_t^k$ & From T$_1$: fully connected, 3 & N$_{RPN}$ $\times$ 3 \\ +$\forall k: p_t^k$ & From T$_1$: fully connected, 3 & N$_{RPN}$ $\times$ 3 \\ +& From T$_1$: fully connected, 2 & N$_{RPN}$ $\times$ 2 \\ $\forall k: o_t^k$ & softmax, 2 & N$_{RPN}$ $\times$ 2 \\ \bottomrule @@ -53,6 +53,8 @@ $\forall k: o_t^k$ & softmax, 2 & N$_{RPN}$ $\times$ 2 \\ \caption { Motion R-CNN ResNet-50 architecture based on the Mask R-CNN ResNet-50 architecture (Table \ref{table:maskrcnn_resnet}). +We use ReLU activations after all hidden layers and +additionally apply dropout with $p = 0.5$ after all fully-connected hidden layers. } \label{table:motionrcnn_resnet} \end{table} @@ -67,30 +69,31 @@ ResNet-50 architecture (Table \ref{table:maskrcnn_resnet}).
\midrule\midrule & input images & H $\times$ W $\times$ C \\ \midrule -C$_5$ & ResNet-50 (Table \ref{table:resnet}) & $\tfrac{1}{32}$ H $\times$ $\tfrac{1}{32}$ W $\times$ 1024 \\ +C$_5$ & ResNet-50 (Table \ref{table:resnet}) & $\tfrac{1}{32}$ H $\times$ $\tfrac{1}{32}$ W $\times$ 2048 \\ \midrule \multicolumn{3}{c}{\textbf{RPN \& FPN} (Table \ref{table:maskrcnn_resnet_fpn})} \\ \midrule \multicolumn{3}{c}{\textbf{Camera Motion Network}}\\ \midrule -& From C$_5$: 1 $\times$ 1 conv, 2048 & $\tfrac{1}{32}$ H $\times$ $\tfrac{1}{32}$ W $\times$ 2048 \\ -& 3 $\times$ 3 conv, 2048, stride 2 & $\tfrac{1}{64}$ H $\times$ $\tfrac{1}{64}$ W $\times$ 2048 \\ -& average pool & 1 $\times$ 2048 \\ -M$_1$ & $\begin{bmatrix}\textrm{fully connected}, 1024\end{bmatrix}$ $\times$ 2 & 1 $\times$ 1024 \\ -$R_t^{cam}$& From M$_1$: fully connected, 3 & 1 $\times$ 3 \\ -$t_t^{cam}$& From M$_1$: fully connected, 3 & 1 $\times$ 3 \\ -& From M$_1$: fully connected, 2 & 1 $\times$ 2 \\ +& From C$_5$: 1 $\times$ 1 conv, 512 & $\tfrac{1}{32}$ H $\times$ $\tfrac{1}{32}$ W $\times$ 512 \\ +& bilinear resize, 7 $\times$ 7 & 7 $\times$ 7 $\times$ 512 \\ +& flatten & 1 $\times$ 7 $\cdot$ 7 $\cdot$ 512 \\ +T$_2$ & $\begin{bmatrix}\textrm{fully connected}, 1024\end{bmatrix}$ $\times$ 2 & 1 $\times$ 1024 \\ +$R_t^{cam}$& From T$_2$: fully connected, 3 & 1 $\times$ 3 \\ +$t_t^{cam}$& From T$_2$: fully connected, 3 & 1 $\times$ 3 \\ +& From T$_2$: fully connected, 2 & 1 $\times$ 2 \\ $o_t^{cam}$& softmax, 2 & 1 $\times$ 2 \\ \midrule \multicolumn{3}{c}{\textbf{RoI Head \& RoI Head: Masks} (Table \ref{table:maskrcnn_resnet_fpn})} \\ \midrule \multicolumn{3}{c}{\textbf{RoI Head: Motions}}\\ \midrule -M$_2$ & From F$_1$: $\begin{bmatrix}\textrm{fully connected}, 1024\end{bmatrix}$ $\times$ 2 & N$_{RPN}$ $\times$ 1024 \\ -$\forall k: R_t^k$ & From M$_2$: fully connected, 3 & N$_{RPN}$ $\times$ 3 \\ -$\forall k: t_t^k$ & From M$_2$: fully connected, 3 & N$_{RPN}$ $\times$ 3 \\ -$\forall k: p_t^k$ & From M$_2$: fully connected, 3 & N$_{RPN}$ $\times$ 3 \\ -& From M$_2$: fully connected, 2 & N$_{RPN}$ $\times$ 2 \\ +& From M$_1$: flatten & N$_{RPN}$ $\times$ 14 $\cdot$ 14 $\cdot$ 256 \\ +T$_3$ & $\begin{bmatrix}\textrm{fully connected}, 1024\end{bmatrix}$ $\times$ 2 & N$_{RPN}$ $\times$ 1024 \\ +$\forall k: R_t^k$ & From T$_3$: fully connected, 3 & N$_{RPN}$ $\times$ 3 \\ +$\forall k: t_t^k$ & From T$_3$: fully connected, 3 & N$_{RPN}$ $\times$ 3 \\ +$\forall k: p_t^k$ & From T$_3$: fully connected, 3 & N$_{RPN}$ $\times$ 3 \\ +& From T$_3$: fully connected, 2 & N$_{RPN}$ $\times$ 2 \\ $\forall k: o_t^k$ & softmax, 2 & N$_{RPN}$ $\times$ 2 \\ \bottomrule @@ -101,12 +104,14 @@ Motion R-CNN ResNet-50-FPN architecture based on the Mask R-CNN ResNet-50-FPN architecture (Table \ref{table:maskrcnn_resnet_fpn}). The modifications are analogous to our Motion R-CNN ResNet-50, but we still show the architecture for completeness. +Again, we use ReLU activations after all hidden layers and +additionally apply dropout with $p = 0.5$ after all fully-connected hidden layers. } \label{table:motionrcnn_resnet_fpn} \end{table} } -\paragraph{Backbone Network} +\paragraph{Motion R-CNN backbone} Like Faster R-CNN and Mask R-CNN, we use a ResNet \cite{ResNet} variant as backbone network to compute feature maps from input imagery.
Inspired by FlowNetS \cite{FlowNet}, we make one modification to the ResNet backbone to enable image matching, @@ -165,8 +170,9 @@ R_t^{k,z}(\gamma) = and $\alpha, \beta, \gamma$ are the rotation angles in radians about the $x,y,z$-axis, respectively. -We then extend the Mask R-CNN head by adding a fully connected layer in parallel to the fully connected layers for -refined boxes and classes. +We then extend the Mask R-CNN head by adding a small fully-connected network for motion +prediction in addition to the fully-connected layers for +refined boxes and classes, and the convolutional network for the masks. Like for refined boxes and masks, we make one separate motion prediction for each class. Each instance motion is predicted as a set of nine scalar parameters, $\sin(\alpha)$, $\sin(\beta)$, $\sin(\gamma)$, $t_t^k$ and $p_t^k$, @@ -180,15 +186,46 @@ still and moving objects. As a postprocessing, for any object instance $k$ with we set $\sin(\alpha) = \sin(\beta) = \sin(\gamma) = 0$ and $t_t^k = (0,0,0)^T$, and thus predict an identity motion. + \paragraph{Camera motion prediction} In addition to the object transformations, we optionally predict the camera motion $\{R_t^{cam}, t_t^{cam}\}\in \mathbf{SE}(3)$ between the two frames $I_t$ and $I_{t+1}$. -For this, we flatten the bottleneck output of the backbone and pass it through a fully connected layer. +For this, we branch off a small fully-connected network from the bottleneck output of the backbone. We again represent $R_t^{cam}$ using a Euler angle representation and predict $\sin(\alpha)$, $\sin(\beta)$, $\sin(\gamma)$ and $t_t^{cam}$ in the same way as for the individual objects. Again, we predict a softmax score $o_t^{cam}$ for differentiating between a still and moving camera. +\subsection{Motion R-CNN network design} + +\label{ssec:design} +\paragraph{Camera motion network} +In our ResNet-50 variant (Table \ref{table:motionrcnn_resnet}), the underlying +ResNet backbone is only computed up to the $C_4$ block, as otherwise the +feature resolution for RoI extraction would be reduced too much. +In this variant, we therefore first pass the $C_4$ features through a $C_5$ +block to make the camera motion networks of both variants comparable. +Then, in both the ResNet-50 and the ResNet-50-FPN variant (Table \ref{table:motionrcnn_resnet_fpn}), we apply an additional +1 $\times$ 1 convolution to the $C_5$ features to reduce the number of inputs to the following +fully-connected layers. +Instead of average pooling, we use bilinear resizing to bring the convolutional features +to a fixed size while retaining spatial information, +flatten them, and finally apply multiple fully-connected layers to compute the +camera motion prediction. + +\paragraph{RoI motion head network} +In both of our network variants +(Tables \ref{table:motionrcnn_resnet} and \ref{table:motionrcnn_resnet_fpn}), +we feed the convolutional mask features into the fully-connected network for motion +prediction, branching off right before the mask upsampling +deconvolution. The intuition behind this is that the final mask features contain +high-resolution spatial information about which positions belong to the object and +which belong to the background. Thus, the motion estimation network can +make use of this information and ideally integrate the motion (image matching) cues +localized within the object, but not those belonging to the background, +into the final object motion estimate.
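For reference, the design just described can be summarized in a short sketch. The following is an illustration only, not the thesis code: the use of tf.keras, the function and tensor names (camera_motion_head, roi_motion_head, c5, mask_features), and the eager-style usage are assumptions made for this sketch; only the layer sizes mirror the rows of Tables \ref{table:motionrcnn_resnet} and \ref{table:motionrcnn_resnet_fpn} and the ReLU/dropout note in their captions.

```python
import tensorflow as tf
from tensorflow.keras import layers

def camera_motion_head(c5):
    """Sketch of the camera motion network.
    c5: backbone bottleneck features, shape [1, H/32, W/32, 2048]."""
    x = layers.Conv2D(512, 1, activation="relu")(c5)       # 1x1 conv, 512
    x = tf.image.resize(x, (7, 7), method="bilinear")      # fixed 7x7 size
    x = layers.Flatten()(x)                                 # 1 x (7*7*512)
    for _ in range(2):                                      # [fc, 1024] x 2
        x = layers.Dense(1024, activation="relu")(x)
        x = layers.Dropout(0.5)(x)
    rot = layers.Dense(3)(x)                 # sin(alpha), sin(beta), sin(gamma)
    trans = layers.Dense(3)(x)               # t_t^cam
    moving = layers.Softmax()(layers.Dense(2)(x))  # o_t^cam
    return rot, trans, moving

def roi_motion_head(mask_features):
    """Sketch of the RoI motion head.
    mask_features: per-RoI features taken right before the mask deconvolution,
    shape [N_RPN, 7, 7, 256] (ResNet-50) or [N_RPN, 14, 14, 256] (FPN)."""
    x = layers.Flatten()(mask_features)
    for _ in range(2):                                      # [fc, 1024] x 2
        x = layers.Dense(1024, activation="relu")(x)
        x = layers.Dropout(0.5)(x)
    rot = layers.Dense(3)(x)                 # sin(alpha), sin(beta), sin(gamma)
    trans = layers.Dense(3)(x)               # t_t^k
    pivot = layers.Dense(3)(x)               # p_t^k
    moving = layers.Softmax()(layers.Dense(2)(x))  # o_t^k
    return rot, trans, pivot, moving

if __name__ == "__main__":
    # Dummy tensors with illustrative shapes only.
    cam = camera_motion_head(tf.zeros([1, 12, 40, 2048]))
    roi = roi_motion_head(tf.zeros([64, 14, 14, 256]))
    print([t.shape for t in cam], [t.shape for t in roi])
```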
+ + \subsection{Supervision} \label{ssec:supervision} diff --git a/background.tex b/background.tex index 2625c3a..0808417 100644 --- a/background.tex +++ b/background.tex @@ -22,7 +22,7 @@ Deep convolutional neural network (CNN) architectures became widely popular through numerous successes in classification and recognition tasks. The general structure of a CNN consists of a convolutional encoder, which learns a spatially compressed, wide (in the number of channels) representation of the input image, -and a fully connected prediction network on top of the encoder. +and a fully-connected prediction network on top of the encoder. The compressed representations learned by CNNs of these categories do not, however, allow for prediction of high-resolution output, as spatial detail is lost through sequential applications @@ -190,6 +190,7 @@ Operations enclosed in a []$_b$ block make up a single ResNet \enquote{bottlenec block (see Figure \ref{figure:bottleneck}). If the block is denoted as []$_b/2$, the first conv operation in the block has a stride of 2. Note that the stride is only applied to the first block, but not to repeated blocks. +Batch normalization \cite{BN} is used after every convolution. } \label{table:resnet} \end{longtable} @@ -317,7 +318,7 @@ classes& softmax, N$_{cls}$ & N$_{RoI}$ $\times$ N$_{cls}$ \\ \midrule \multicolumn{3}{c}{\textbf{RoI Head: Masks}}\\ \midrule -& From R$_1$: 2 $\times$ 2 deconv, 256, stride 2 & N$_{RoI}$ $\times$ 14 $\times$ 14 $\times$ 256 \\ +M$_0$ & From R$_1$: 2 $\times$ 2 deconv, 256, stride 2 & N$_{RoI}$ $\times$ 14 $\times$ 14 $\times$ 256 \\ & 1 $\times$ 1 conv, N$_{cls}$ & N$_{RoI}$ $\times$ 14 $\times$ 14 $\times$ N$_{cls}$ \\ masks & sigmoid, N$_{cls}$ & N$_{RoI}$ $\times$ 28 $\times$ 28 $\times$ N$_{cls}$ \\ @@ -410,7 +411,7 @@ classes& softmax, N$_{cls}$ & N$_{RoI}$ $\times$ N$_{cls}$ \\ \midrule \multicolumn{3}{c}{\textbf{RoI Head: Masks}}\\ \midrule -& From R$_2$: $\begin{bmatrix}\textrm{3 $\times$ 3 conv} \end{bmatrix}$ $\times$ 4, 256 & N$_{RoI}$ $\times$ 14 $\times$ 14 $\times$ 256 \\ +M$_1$ & From R$_2$: $\begin{bmatrix}\textrm{3 $\times$ 3 conv} \end{bmatrix}$ $\times$ 4, 256 & N$_{RoI}$ $\times$ 14 $\times$ 14 $\times$ 256 \\ & 2 $\times$ 2 deconv, 256, stride 2 & N$_{RoI}$ $\times$ 28 $\times$ 28 $\times$ 256 \\ & 1 $\times$ 1 conv, N$_{cls}$ & N$_{RoI}$ $\times$ 28 $\times$ 28 $\times$ N$_{cls}$ \\ masks & sigmoid, N$_{cls}$ & N$_{RoI}$ $\times$ 28 $\times$ 28 $\times$ N$_{cls}$ \\ diff --git a/bib.bib b/bib.bib index 705fb39..35976d7 100644 --- a/bib.bib +++ b/bib.bib @@ -237,3 +237,15 @@ title = {End-to-End Learning of Geometry and Context for Deep Stereo Regression}, booktitle = {CVPR}, year = {2017}} + +@inproceedings{BN, + author = {Sergey Ioffe and Christian Szegedy}, + title = {Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift}, + booktitle = {ICML}, + year = {2015}} + +@inproceedings{He, + author = {Kaiming He and Xiangyu Zhang and Shaoqing Ren and Jian Sun}, + title = {Delving Deep into Rectifiers: Surpassing Human-Level Performance on ImageNet Classification}, + booktitle = {ICCV}, + year = {2015}} diff --git a/experiments.tex b/experiments.tex index 263084d..a3cea47 100644 --- a/experiments.tex +++ b/experiments.tex @@ -41,9 +41,6 @@ to be used as validation set. From the remaining 2026 examples, we remove a small number of examples without object instances and use the resulting data as training set. 
-\paragraph{KITTI 2015} -\todo{add this if adding KITTI 2015 evaluations} - \paragraph{Motion ground truth from 3D poses and camera extrinsics} We will now describe how we use the ground truth poses and camera matrices from Virtual KITTI to compute instance and camera motion ground truth. @@ -161,7 +158,20 @@ As learning rate we use $0.25 \cdot 10^{-2}$ for the first 144K iterations and $0.25 \cdot 10^{-3}$ for all remaining iterations. \paragraph{R-CNN training parameters} -\todo{add this} +For training the RPN and RoI heads and during inference, +we use the same numbers of proposals and RoIs as Mask R-CNN in +the ResNet-50 and ResNet-50-FPN variants, respectively. + +\paragraph{Initialization} +Following the pre-existing TensorFlow implementation of Faster R-CNN, +we initialize all hidden layers with He initialization \cite{He}. +For the fully-connected camera and instance motion output layers, +we use a truncated normal initializer with a standard +deviation of $0.0001$ and zero mean, truncated at two standard deviations. +Note that a larger initial standard deviation prevented the +angle sine estimates from converging to the very small values they +are generally expected to take. + \subsection{Experiments on Virtual KITTI} \label{ssec:vkitti} @@ -221,7 +231,3 @@ Figure \ref{figure:vkitti} visualizes instance segmentation and optical flow results on the Virtual KITTI validation set. Table \ref{table:vkitti} compares the performance of different network variants on the Virtual KITTI validation set. - -\subsection{Evaluation on KITTI 2015} -\todo{if there is enough time, -add evaluation (table + visualizations) of instance segmentation and composed optical flow on RGB-D frames from KITTI 2015 train} diff --git a/introduction.tex b/introduction.tex index 35732c8..1af994d 100644 --- a/introduction.tex +++ b/introduction.tex @@ -40,7 +40,7 @@ the location and 3D motion of each object instance relative to the camera Recently, SfM-Net \cite{SfmNet} introduced an end-to-end deep learning approach for predicting depth and dense optical flow in monocular image sequences based on estimating the 3D motion of individual objects and the camera. SfM-Net predicts a batch of binary full image masks specyfing the object memberships of individual pixels with a standard encoder-decoder -network for pixel-wise prediction. A fully connected network branching off the encoder predicts a 3D motion for each object. +network for pixel-wise prediction. A fully-connected network branching off the encoder predicts a 3D motion for each object. However, due to the fixed number of objects masks, the system can only predict a small number of motions and often fails to properly segment the pixels into the correct masks or assigns background pixels to object motions (Figure \ref{figure:sfmnet_kitti}). \begin{figure}[t] @@ -95,8 +95,8 @@ manageable pieces. \includegraphics[width=\textwidth]{figures/net_intro} \caption{ Overview of our network based on Mask R-CNN. For each RoI, we predict the instance motion -in parallel to the class, bounding box and mask. We branch off a fully connected -layer for predicting the camera motion from the bottleneck. +in parallel to the class, bounding box and mask. We additionally branch off a +small network for predicting the camera motion from the bottleneck. } \label{figure:net_intro} \end{figure}
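To make the initialization paragraph added in experiments.tex concrete, the snippet below sketches the described initializers. The tf.keras initializer classes are stand-ins chosen for illustration (the thesis builds on a pre-existing TensorFlow Faster R-CNN implementation, so these names are assumptions); the parameters follow the text: He initialization for hidden layers, and a zero-mean truncated normal with standard deviation $10^{-4}$, truncated at two standard deviations, for the motion output layers.

```python
import tensorflow as tf

# Hidden layers: He initialization (tf.keras's HeNormal used as a stand-in).
he_init = tf.keras.initializers.HeNormal()

# Camera / instance motion output layers: zero-mean truncated normal with
# stddev 1e-4; tf.keras's TruncatedNormal re-draws samples beyond two
# standard deviations, matching the truncation described in the text.
motion_output_init = tf.keras.initializers.TruncatedNormal(mean=0.0, stddev=1e-4)

hidden = tf.keras.layers.Dense(1024, activation="relu",
                               kernel_initializer=he_init)
rotation_out = tf.keras.layers.Dense(3, kernel_initializer=motion_output_init)

# The angle sine outputs start close to zero, as intended for small rotations.
print(rotation_out(tf.random.normal([1, 1024])))
```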