Simon Meister 2017-11-13 13:54:48 +01:00
parent 024af8fede
commit d093e7d2ff
5 changed files with 101 additions and 45 deletions

@@ -1,6 +1,6 @@
\subsection{Motion R-CNN architecture}
\label{ssec:architecture}
\subsection{Motion R-CNN}
\label{ssec:model}
Building on Mask R-CNN \cite{MaskRCNN},
we estimate per-object motion by predicting the 3D motion of each detected object.
@@ -26,25 +26,25 @@ C$_4$ & ResNet-50 \{up to C$_4$\} (Table \ref{table:resnet}) & $\tfrac{1}{16}$ H
\multicolumn{3}{c}{\textbf{Camera Motion Network}}\\
\midrule
& From C$_4$: ResNet-50 \{C$_5$\} (Table \ref{table:resnet}) & $\tfrac{1}{32}$ H $\times$ $\tfrac{1}{32}$ W $\times$ 2048 \\
& 1 $\times$ 1 conv, 2048 & $\tfrac{1}{32}$ H $\times$ $\tfrac{1}{32}$ W $\times$ 2048 \\
& 3 $\times$ 3 conv, 2048, stride 2 & $\tfrac{1}{64}$ H $\times$ $\tfrac{1}{64}$ W $\times$ 2048 \\
& average pool & 1 $\times$ 2048 \\
M$_1$ & $\begin{bmatrix}\textrm{fully connected}, 1024\end{bmatrix}$ $\times$ 2 & 1 $\times$ 1024 \\
& bilinear resize, 7 $\times$ 7 & 7 $\times$ 7 $\times$ 512 \\
& flatten & 1 $\times$ 7 $\cdot$ 7 $\cdot$ 512 \\
T$_0$ & $\begin{bmatrix}\textrm{fully connected}, 1024\end{bmatrix}$ $\times$ 2 & 1 $\times$ 1024 \\
$R_t^{cam}$& From M$_1$: fully connected, 3 & 1 $\times$ 3 \\
$t_t^{cam}$& From M$_1$: fully connected, 3 & 1 $\times$ 3 \\
& From M$_1$: fully connected, 2 & 1 $\times$ 2 \\
$R_t^{cam}$& From T$_0$: fully connected, 3 & 1 $\times$ 3 \\
$t_t^{cam}$& From T$_0$: fully connected, 3 & 1 $\times$ 3 \\
& From T$_0$: fully connected, 2 & 1 $\times$ 2 \\
$o_t^{cam}$& softmax, 2 & 1 $\times$ 2 \\
\midrule
\multicolumn{3}{c}{\textbf{RoI Head \& RoI Head: Masks} (Table \ref{table:maskrcnn_resnet})}\\
\midrule
\multicolumn{3}{c}{\textbf{RoI Head: Motions}}\\
\midrule
M$_2$ & From ave: $\begin{bmatrix}\textrm{fully connected}, 1024\end{bmatrix}$ $\times$ 2 & N$_{RPN}$ $\times$ 1024 \\
$\forall k: R_t^k$ & From M$_2$: fully connected, 3 & N$_{RPN}$ $\times$ 3 \\
$\forall k: t_t^k$ & From M$_2$: fully connected, 3 & N$_{RPN}$ $\times$ 3 \\
$\forall k: p_t^k$ & From M$_2$: fully connected, 3 & N$_{RPN}$ $\times$ 3 \\
& From M$_2$: fully connected, 2 & N$_{RPN}$ $\times$ 2 \\
& From M$_0$: flatten & N$_{RPN}$ $\times$ 7 $\cdot$ 7 $\cdot$ 256 \\
T$_1$ & $\begin{bmatrix}\textrm{fully connected}, 1024\end{bmatrix}$ $\times$ 2 & N$_{RPN}$ $\times$ 1024 \\
$\forall k: R_t^k$ & From T$_1$: fully connected, 3 & N$_{RPN}$ $\times$ 3 \\
$\forall k: t_t^k$ & From T$_1$: fully connected, 3 & N$_{RPN}$ $\times$ 3 \\
$\forall k: p_t^k$ & From T$_1$: fully connected, 3 & N$_{RPN}$ $\times$ 3 \\
& From T$_1$: fully connected, 2 & N$_{RPN}$ $\times$ 2 \\
$\forall k: o_t^k$ & softmax, 2 & N$_{RPN}$ $\times$ 2 \\
\bottomrule
@@ -53,6 +53,8 @@ $\forall k: o_t^k$ & softmax, 2 & N$_{RPN}$ $\times$ 2 \\
\caption {
Motion R-CNN ResNet-50 architecture based on the Mask R-CNN
ResNet-50 architecture (Table \ref{table:maskrcnn_resnet}).
We use ReLU activations after all hidden layers and
additionally dropout with $p = 0.5$ after all fully-connected hidden layers.
}
\label{table:motionrcnn_resnet}
\end{table}
@@ -67,30 +69,31 @@ ResNet-50 architecture (Table \ref{table:maskrcnn_resnet}).
\midrule\midrule
& input images & H $\times$ W $\times$ C \\
\midrule
C$_5$ & ResNet-50 (Table \ref{table:resnet}) & $\tfrac{1}{32}$ H $\times$ $\tfrac{1}{32}$ W $\times$ 1024 \\
C$_5$ & ResNet-50 (Table \ref{table:resnet}) & $\tfrac{1}{32}$ H $\times$ $\tfrac{1}{32}$ W $\times$ 2048 \\
\midrule
\multicolumn{3}{c}{\textbf{RPN \& FPN} (Table \ref{table:maskrcnn_resnet_fpn})} \\
\midrule
\multicolumn{3}{c}{\textbf{Camera Motion Network}}\\
\midrule
& From C$_5$: 1 $\times$ 1 conv, 2048 & $\tfrac{1}{32}$ H $\times$ $\tfrac{1}{32}$ W $\times$ 2048 \\
& 3 $\times$ 3 conv, 2048, stride 2 & $\tfrac{1}{64}$ H $\times$ $\tfrac{1}{64}$ W $\times$ 2048 \\
& average pool & 1 $\times$ 2048 \\
M$_1$ & $\begin{bmatrix}\textrm{fully connected}, 1024\end{bmatrix}$ $\times$ 2 & 1 $\times$ 1024 \\
$R_t^{cam}$& From M$_1$: fully connected, 3 & 1 $\times$ 3 \\
$t_t^{cam}$& From M$_1$: fully connected, 3 & 1 $\times$ 3 \\
& From M$_1$: fully connected, 2 & 1 $\times$ 2 \\
& From C$_5$: 1 $\times$ 1 conv, 512 & $\tfrac{1}{32}$ H $\times$ $\tfrac{1}{32}$ W $\times$ 512 \\
& bilinear resize, 7 $\times$ 7 & 7 $\times$ 7 $\times$ 512 \\
& flatten & 1 $\times$ 7 $\cdot$ 7 $\cdot$ 512 \\
T$_2$ & $\begin{bmatrix}\textrm{fully connected}, 1024\end{bmatrix}$ $\times$ 2 & 1 $\times$ 1024 \\
$R_t^{cam}$& From T$_2$: fully connected, 3 & 1 $\times$ 3 \\
$t_t^{cam}$& From T$_2$: fully connected, 3 & 1 $\times$ 3 \\
& From T$_2$: fully connected, 2 & 1 $\times$ 2 \\
$o_t^{cam}$& softmax, 2 & 1 $\times$ 2 \\
\midrule
\multicolumn{3}{c}{\textbf{RoI Head \& RoI Head: Masks} (Table \ref{table:maskrcnn_resnet_fpn})} \\
\midrule
\multicolumn{3}{c}{\textbf{RoI Head: Motions}}\\
\midrule
M$_2$ & From F$_1$: $\begin{bmatrix}\textrm{fully connected}, 1024\end{bmatrix}$ $\times$ 2 & N$_{RPN}$ $\times$ 1024 \\
$\forall k: R_t^k$ & From M$_2$: fully connected, 3 & N$_{RPN}$ $\times$ 3 \\
$\forall k: t_t^k$ & From M$_2$: fully connected, 3 & N$_{RPN}$ $\times$ 3 \\
$\forall k: p_t^k$ & From M$_2$: fully connected, 3 & N$_{RPN}$ $\times$ 3 \\
& From M$_2$: fully connected, 2 & N$_{RPN}$ $\times$ 2 \\
& From M$_1$: flatten & N$_{RPN}$ $\times$ 14 $\cdot$ 14 $\cdot$ 256 \\
T$_3$ & $\begin{bmatrix}\textrm{fully connected}, 1024\end{bmatrix}$ $\times$ 2 & N$_{RPN}$ $\times$ 1024 \\
$\forall k: R_t^k$ & From T$_3$: fully connected, 3 & N$_{RPN}$ $\times$ 3 \\
$\forall k: t_t^k$ & From T$_3$: fully connected, 3 & N$_{RPN}$ $\times$ 3 \\
$\forall k: p_t^k$ & From T$_3$: fully connected, 3 & N$_{RPN}$ $\times$ 3 \\
& From T$_3$: fully connected, 2 & N$_{RPN}$ $\times$ 2 \\
$\forall k: o_t^k$ & softmax, 2 & N$_{RPN}$ $\times$ 2 \\
\bottomrule
@@ -101,12 +104,14 @@ Motion R-CNN ResNet-50-FPN architecture based on the Mask R-CNN
ResNet-50-FPN architecture (Table \ref{table:maskrcnn_resnet_fpn}).
The modifications are analogous to those in our Motion R-CNN ResNet-50,
but we show the full architecture here for completeness.
Again, we use ReLU activations after all hidden layers and
additionally dropout with $p = 0.5$ after all fully-connected hidden layers.
}
\label{table:motionrcnn_resnet_fpn}
\end{table}
}
\paragraph{Backbone Network}
\paragraph{Motion R-CNN backbone}
Like Faster R-CNN and Mask R-CNN, we use a ResNet \cite{ResNet} variant as backbone network to compute feature maps from input imagery.
Inspired by FlowNetS \cite{FlowNet}, we make one modification to the ResNet backbone to enable image matching,
@@ -165,8 +170,9 @@ R_t^{k,z}(\gamma) =
and $\alpha, \beta, \gamma$ are the rotation angles in radians about the $x,y,z$-axis, respectively.
We then extend the Mask R-CNN head by adding a fully connected layer in parallel to the fully connected layers for
refined boxes and classes.
We then extend the Mask R-CNN head by adding a small fully-connected network for motion
prediction in addition to the fully-connected layers for
refined boxes and classes and the convolutional network for the masks.
As for refined boxes and masks, we make a separate motion prediction for each class.
Each instance motion is predicted as a set of nine scalar parameters,
$\sin(\alpha)$, $\sin(\beta)$, $\sin(\gamma)$, $t_t^k$ and $p_t^k$,
@@ -180,15 +186,46 @@ still and moving objects. As a postprocessing, for any object instance $k$ with
we set $\sin(\alpha) = \sin(\beta) = \sin(\gamma) = 0$ and $t_t^k = (0,0,0)^T$,
and thus predict an identity motion.
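To make this parameterization and postprocessing concrete, the following is a minimal NumPy sketch (not our actual implementation): the composition order $R = R^{k,z} R^{k,y} R^{k,x}$, the $0.5$ score threshold and all helper names are assumptions for illustration, and the cosines are recovered from the predicted sines under the assumption that all angles lie in $[-\tfrac{\pi}{2}, \tfrac{\pi}{2}]$.
\begin{verbatim}
import numpy as np

def rotation_from_sines(sa, sb, sg):
    # Recover cosines from the predicted sines; for angles in
    # [-pi/2, pi/2] the cosines are non-negative.
    ca, cb, cg = (np.sqrt(1.0 - s ** 2) for s in (sa, sb, sg))
    Rx = np.array([[1, 0, 0], [0, ca, -sa], [0, sa, ca]])
    Ry = np.array([[cb, 0, sb], [0, 1, 0], [-sb, 0, cb]])
    Rz = np.array([[cg, -sg, 0], [sg, cg, 0], [0, 0, 1]])
    return Rz @ Ry @ Rx  # assumed composition order

def postprocess_motion(sa, sb, sg, t, o_moving):
    # Replace the motion of instances classified as still
    # by the identity motion.
    if o_moving < 0.5:
        return np.eye(3), np.zeros(3)
    return rotation_from_sines(sa, sb, sg), np.asarray(t, dtype=float)
\end{verbatim}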
\paragraph{Camera motion prediction}
In addition to the object transformations, we optionally predict the camera motion $\{R_t^{cam}, t_t^{cam}\}\in \mathbf{SE}(3)$
between the two frames $I_t$ and $I_{t+1}$.
For this, we flatten the bottleneck output of the backbone and pass it through a fully connected layer.
For this, we branch off a small fully-connected network from the bottleneck output of the backbone.
We again represent $R_t^{cam}$ using an Euler angle representation and
predict $\sin(\alpha)$, $\sin(\beta)$, $\sin(\gamma)$ and $t_t^{cam}$ in the same way as for the individual objects.
Again, we predict a softmax score $o_t^{cam}$ for differentiating between
a still and moving camera.
\subsection{Motion R-CNN network design}
\label{ssec:design}
\paragraph{Camera motion network}
In our ResNet-50 variant (Table \ref{table:motionrcnn_resnet}), the underlying
ResNet backbone is only computed up to the $C_4$ block, as otherwise the
feature resolution for RoI extraction would be reduced too much.
In this variant, we therefore first pass the $C_4$ features through a $C_5$
block to make the camera motion network of both variants comparable.
Then, in both the ResNet-50 and the ResNet-50-FPN variant (Table \ref{table:motionrcnn_resnet_fpn}), we apply an additional
convolution to the $C_5$ features to reduce the number of inputs to the following
fully-connected layers.
Instead of averaging, we use bilinear resizing to bring the convolutional features
to a fixed size without losing spatial information,
flatten them, and finally apply multiple fully-connected layers to compute the
camera motion prediction.
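As an illustration, a TensorFlow sketch of this camera motion network with the layer sizes from Tables \ref{table:motionrcnn_resnet} and \ref{table:motionrcnn_resnet_fpn} (the function and variable names are hypothetical, and the calls are meant as a sketch rather than our exact implementation):
\begin{verbatim}
import tensorflow as tf

def camera_motion_head(c5, is_training):
    # 1 x 1 convolution reducing the channels before the fc layers.
    x = tf.layers.conv2d(c5, 512, 1, activation=tf.nn.relu)
    # Bilinear resize to a fixed 7 x 7 grid instead of average
    # pooling, keeping coarse spatial information.
    x = tf.image.resize_bilinear(x, [7, 7])
    x = tf.layers.flatten(x)
    for _ in range(2):
        x = tf.layers.dense(x, 1024, activation=tf.nn.relu)
        x = tf.layers.dropout(x, rate=0.5, training=is_training)
    sines = tf.layers.dense(x, 3)  # sin(alpha), sin(beta), sin(gamma)
    t_cam = tf.layers.dense(x, 3)  # camera translation
    o_cam = tf.nn.softmax(tf.layers.dense(x, 2))  # still vs. moving
    return sines, t_cam, o_cam
\end{verbatim}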
\paragraph{RoI motion head network}
In both of our network variants
(Tables \ref{table:motionrcnn_resnet} and \ref{table:motionrcnn_resnet_fpn}),
we apply the fully-connected network for motion prediction to the
convolutional mask features, branching off right before the mask upsampling
deconvolution. The intuition behind this is that the final mask features contain
high-resolution spatial information about which positions belong to the object and
which belong to the background. Thus, the motion estimation network can ideally
integrate the motion (image matching) information localized within the object,
but not that belonging to the background, into the final object motion estimate.
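A corresponding sketch of the RoI motion head (again with hypothetical names; in the FPN variant, the input features are the $14 \times 14 \times 256$ mask features right before the deconvolution):
\begin{verbatim}
def roi_motion_head(mask_features, is_training):
    # mask_features: [N_RPN, 14, 14, 256] in the FPN variant, taken
    # right before the mask upsampling deconvolution.
    x = tf.layers.flatten(mask_features)
    for _ in range(2):
        x = tf.layers.dense(x, 1024, activation=tf.nn.relu)
        x = tf.layers.dropout(x, rate=0.5, training=is_training)
    sines = tf.layers.dense(x, 3)  # sin(alpha), sin(beta), sin(gamma)
    t = tf.layers.dense(x, 3)      # translation t_t^k
    p = tf.layers.dense(x, 3)      # pivot p_t^k
    o = tf.nn.softmax(tf.layers.dense(x, 2))  # still vs. moving
    return sines, t, p, o
\end{verbatim}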
\subsection{Supervision}
\label{ssec:supervision}

@@ -22,7 +22,7 @@ Deep convolutional neural network (CNN) architectures
became widely popular through numerous successes in classification and recognition tasks.
The general structure of a CNN consists of a convolutional encoder, which
learns a spatially compressed, wide (in the number of channels) representation of the input image,
and a fully connected prediction network on top of the encoder.
and a fully-connected prediction network on top of the encoder.
The compressed representations learned by CNNs of these categories do not, however, allow
for prediction of high-resolution output, as spatial detail is lost through sequential applications
@@ -190,6 +190,7 @@ Operations enclosed in a []$_b$ block make up a single ResNet \enquote{bottleneck}
block (see Figure \ref{figure:bottleneck}). If the block is denoted as []$_b/2$,
the first conv operation in the block has a stride of 2. Note that the stride
is only applied to the first block, but not to repeated blocks.
Batch normalization \cite{BN} is used after every convolution.
}
\label{table:resnet}
\end{longtable}
@@ -317,7 +318,7 @@ classes& softmax, N$_{cls}$ & N$_{RoI}$ $\times$ N$_{cls}$ \\
\midrule
\multicolumn{3}{c}{\textbf{RoI Head: Masks}}\\
\midrule
& From R$_1$: 2 $\times$ 2 deconv, 256, stride 2 & N$_{RoI}$ $\times$ 14 $\times$ 14 $\times$ 256 \\
M$_0$ & From R$_1$: 2 $\times$ 2 deconv, 256, stride 2 & N$_{RoI}$ $\times$ 14 $\times$ 14 $\times$ 256 \\
& 1 $\times$ 1 conv, N$_{cls}$ & N$_{RoI}$ $\times$ 14 $\times$ 14 $\times$ N$_{cls}$ \\
masks & sigmoid, N$_{cls}$ & N$_{RoI}$ $\times$ 28 $\times$ 28 $\times$ N$_{cls}$ \\
@@ -410,7 +411,7 @@ classes& softmax, N$_{cls}$ & N$_{RoI}$ $\times$ N$_{cls}$ \\
\midrule
\multicolumn{3}{c}{\textbf{RoI Head: Masks}}\\
\midrule
& From R$_2$: $\begin{bmatrix}\textrm{3 $\times$ 3 conv} \end{bmatrix}$ $\times$ 4, 256 & N$_{RoI}$ $\times$ 14 $\times$ 14 $\times$ 256 \\
M$_1$ & From R$_2$: $\begin{bmatrix}\textrm{3 $\times$ 3 conv} \end{bmatrix}$ $\times$ 4, 256 & N$_{RoI}$ $\times$ 14 $\times$ 14 $\times$ 256 \\
& 2 $\times$ 2 deconv, 256, stride 2 & N$_{RoI}$ $\times$ 28 $\times$ 28 $\times$ 256 \\
& 1 $\times$ 1 conv, N$_{cls}$ & N$_{RoI}$ $\times$ 28 $\times$ 28 $\times$ N$_{cls}$ \\
masks & sigmoid, N$_{cls}$ & N$_{RoI}$ $\times$ 28 $\times$ 28 $\times$ N$_{cls}$ \\

bib.bib

@@ -237,3 +237,15 @@
title = {End-to-End Learning of Geometry and Context for Deep Stereo Regression},
booktitle = {CVPR},
year = {2017}}
@inproceedings{BN,
author = {Sergey Ioffe and Christian Szegedy},
title = {Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift},
booktitle = {ICML},
year = {2015}}
@inproceedings{He,
author = {Kaiming He and Xiangyu Zhang and Shaoqing Ren and Jian Sun},
title = {Delving Deep into Rectifiers: Surpassing Human-Level Performance on ImageNet Classification},
booktitle = {ICCV},
year = {2015}}

@@ -41,9 +41,6 @@ to be used as validation set. From the remaining 2026 examples,
we remove a small number of examples without object instances and use the resulting
data as training set.
\paragraph{KITTI 2015}
\todo{add this if adding KITTI 2015 evaluations}
\paragraph{Motion ground truth from 3D poses and camera extrinsics}
We will now describe how we use the ground truth poses and camera matrices from Virtual KITTI to
compute instance and camera motion ground truth.
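As a preview of the idea, consider an object with ground truth poses $(R_t, t_t)$ and $(R_{t+1}, t_{t+1})$ mapping object to camera coordinates at the two frames; a NumPy sketch under this assumed pose convention (the full derivation follows in the text):
\begin{verbatim}
import numpy as np

def relative_motion(R_t, t_t, R_t1, t_t1):
    # Poses map object to camera coordinates, x_cam = R x_obj + t
    # (assumed convention). The rigid motion taking frame-t camera
    # points to frame-(t+1) camera points is x' = R x + t with:
    R = R_t1 @ R_t.T
    t = t_t1 - R @ t_t
    return R, t
\end{verbatim}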
@@ -161,7 +158,20 @@ As learning rate we use $0.25 \cdot 10^{-2}$ for the
first 144K iterations and $0.25 \cdot 10^{-3}$ for all remaining iterations.
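This schedule can be expressed, for example, with TensorFlow's piecewise-constant helper (a sketch; the surrounding optimizer setup is omitted):
\begin{verbatim}
import tensorflow as tf

global_step = tf.train.get_or_create_global_step()
# 0.25e-2 for the first 144K iterations, 0.25e-3 afterwards.
learning_rate = tf.train.piecewise_constant(
    global_step, boundaries=[144000], values=[0.25e-2, 0.25e-3])
\end{verbatim}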
\paragraph{R-CNN training parameters}
\todo{add this}
For training the RPN and RoI heads, as well as during inference,
we use exactly the same numbers of proposals and RoIs as Mask R-CNN in
the ResNet-50 and ResNet-50-FPN variants, respectively.
\paragraph{Initialization}
Following the pre-existing TensorFlow implementation of Faster R-CNN,
we initialize all hidden layers with He initialization \cite{He}.
For the fully-connected camera and instance motion output layers,
we use a truncated normal initializer with a standard
deviation of $0.0001$ and zero mean, truncated at two standard deviations.
Note that a larger initial standard deviation prevented the
angle sine estimates from properly converging to the very small values they
are in general expected to take.
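A sketch of these initializers in TensorFlow (assumed calls; note that tf.truncated_normal_initializer truncates at two standard deviations by default):
\begin{verbatim}
import tensorflow as tf

# He initialization for the hidden layers.
he_init = tf.variance_scaling_initializer(scale=2.0, mode='fan_in')
# Small truncated normal for the motion output layers.
motion_init = tf.truncated_normal_initializer(mean=0.0, stddev=1e-4)

def motion_output(features, num_outputs):
    return tf.layers.dense(features, num_outputs,
                           kernel_initializer=motion_init)
\end{verbatim}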
\subsection{Experiments on Virtual KITTI}
\label{ssec:vkitti}
@@ -221,7 +231,3 @@ Figure \ref{figure:vkitti} visualizes instance segmentation and optical flow
results on the Virtual KITTI validation set.
Table \ref{table:vkitti} compares the performance of different network variants on the Virtual KITTI validation
set.
\subsection{Evaluation on KITTI 2015}
\todo{if there is enough time,
add evaluation (table + visualizations) of instance segmentation and composed optical flow on RGB-D frames from KITTI 2015 train}

@@ -40,7 +40,7 @@ the location and 3D motion of each object instance relative to the camera
Recently, SfM-Net \cite{SfmNet} introduced an end-to-end deep learning approach for predicting depth
and dense optical flow in monocular image sequences based on estimating the 3D motion of individual objects and the camera.
SfM-Net predicts a batch of binary full-image masks specifying the object memberships of individual pixels with a standard encoder-decoder
network for pixel-wise prediction. A fully connected network branching off the encoder predicts a 3D motion for each object.
network for pixel-wise prediction. A fully-connected network branching off the encoder predicts a 3D motion for each object.
However, due to the fixed number of object masks, the system can only predict a small number of motions and
often fails to properly segment the pixels into the correct masks or assigns background pixels to object motions (Figure \ref{figure:sfmnet_kitti}).
\begin{figure}[t]
@@ -95,8 +95,8 @@ manageable pieces.
\includegraphics[width=\textwidth]{figures/net_intro}
\caption{
Overview of our network based on Mask R-CNN. For each RoI, we predict the instance motion
in parallel to the class, bounding box and mask. We branch off a fully connected
layer for predicting the camera motion from the bottleneck.
in parallel to the class, bounding box and mask. We additionally branch off a
small network for predicting the camera motion from the bottleneck.
}
\label{figure:net_intro}
\end{figure}