Mirror of https://github.com/tu-darmstadt-informatik/bsc-thesis.git, synced 2025-12-13 09:55:49 +00:00
Commit d093e7d2ff (parent 024af8fede): WIP

approach.tex: 99 changed lines
@@ -1,6 +1,6 @@
\subsection{Motion R-CNN architecture}
\label{ssec:architecture}
\subsection{Motion R-CNN}
\label{ssec:model}

Building on Mask R-CNN \cite{MaskRCNN},
we estimate per-object motion by predicting the 3D motion of each detected object.
@@ -26,25 +26,25 @@ C$_4$ & ResNet-50 \{up to C$_4$\} (Table \ref{table:resnet}) & $\tfrac{1}{16}$ H
\multicolumn{3}{c}{\textbf{Camera Motion Network}}\\
\midrule
& From C$_4$: ResNet-50 \{C$_5$\} (Table \ref{table:resnet}) & $\tfrac{1}{32}$ H $\times$ $\tfrac{1}{32}$ W $\times$ 2048 \\
& 1 $\times$ 1 conv, 2048 & $\tfrac{1}{32}$ H $\times$ $\tfrac{1}{32}$ W $\times$ 2048 \\
& 3 $\times$ 3 conv, 2048, stride 2 & $\tfrac{1}{64}$ H $\times$ $\tfrac{1}{64}$ W $\times$ 2048 \\
& average pool & 1 $\times$ 2048 \\
M$_1$ & $\begin{bmatrix}\textrm{fully connected}, 1024\end{bmatrix}$ $\times$ 2 & 1 $\times$ 1024 \\
& bilinear resize, 7 $\times$ 7 & 7 $\times$ 7 $\times$ 512 \\
& flatten & 1 $\times$ 7 $\cdot$ 7 $\cdot$ 512 \\
T$_0$ & $\begin{bmatrix}\textrm{fully connected}, 1024\end{bmatrix}$ $\times$ 2 & 1 $\times$ 1024 \\

$R_t^{cam}$& From M$_1$: fully connected, 3 & 1 $\times$ 3 \\
$t_t^{cam}$& From M$_1$: fully connected, 3 & 1 $\times$ 3 \\
& From M$_1$: fully connected, 2 & 1 $\times$ 2 \\
$R_t^{cam}$& From T$_0$: fully connected, 3 & 1 $\times$ 3 \\
$t_t^{cam}$& From T$_0$: fully connected, 3 & 1 $\times$ 3 \\
& From T$_0$: fully connected, 2 & 1 $\times$ 2 \\
$o_t^{cam}$& softmax, 2 & 1 $\times$ 2 \\
\midrule
\multicolumn{3}{c}{\textbf{RoI Head \& RoI Head: Masks} (Table \ref{table:maskrcnn_resnet})}\\
\midrule
\multicolumn{3}{c}{\textbf{RoI Head: Motions}}\\
\midrule
M$_2$ & From ave: $\begin{bmatrix}\textrm{fully connected}, 1024\end{bmatrix}$ $\times$ 2 & N$_{RPN}$ $\times$ 1024 \\
$\forall k: R_t^k$ & From M$_2$: fully connected, 3 & N$_{RPN}$ $\times$ 3 \\
$\forall k: t_t^k$ & From M$_2$: fully connected, 3 & N$_{RPN}$ $\times$ 3 \\
$\forall k: p_t^k$ & From M$_2$: fully connected, 3 & N$_{RPN}$ $\times$ 3 \\
& From M$_2$: fully connected, 2 & N$_{RPN}$ $\times$ 2 \\
& From M$_0$: flatten & N$_{RPN}$ $\times$ 7 $\cdot$ 7 $\cdot$ 256 \\
T$_1$ & $\begin{bmatrix}\textrm{fully connected}, 1024\end{bmatrix}$ $\times$ 2 & N$_{RPN}$ $\times$ 1024 \\
$\forall k: R_t^k$ & From T$_1$: fully connected, 3 & N$_{RPN}$ $\times$ 3 \\
$\forall k: t_t^k$ & From T$_1$: fully connected, 3 & N$_{RPN}$ $\times$ 3 \\
$\forall k: p_t^k$ & From T$_1$: fully connected, 3 & N$_{RPN}$ $\times$ 3 \\
& From T$_1$: fully connected, 2 & N$_{RPN}$ $\times$ 2 \\
$\forall k: o_t^k$ & softmax, 2 & N$_{RPN}$ $\times$ 2 \\

\bottomrule
@@ -53,6 +53,8 @@ $\forall k: o_t^k$ & softmax, 2 & N$_{RPN}$ $\times$ 2 \\
\caption {
Motion R-CNN ResNet-50 architecture based on the Mask R-CNN
ResNet-50 architecture (Table \ref{table:maskrcnn_resnet}).
We use ReLU activations after all hidden layers and
additionally dropout with $p = 0.5$ after all fully-connected hidden layers.
}
\label{table:motionrcnn_resnet}
\end{table}

@@ -67,30 +69,31 @@ ResNet-50 architecture (Table \ref{table:maskrcnn_resnet}).
\midrule\midrule
& input images & H $\times$ W $\times$ C \\
\midrule
C$_5$ & ResNet-50 (Table \ref{table:resnet}) & $\tfrac{1}{32}$ H $\times$ $\tfrac{1}{32}$ W $\times$ 1024 \\
C$_5$ & ResNet-50 (Table \ref{table:resnet}) & $\tfrac{1}{32}$ H $\times$ $\tfrac{1}{32}$ W $\times$ 2048 \\
\midrule
\multicolumn{3}{c}{\textbf{RPN \& FPN} (Table \ref{table:maskrcnn_resnet_fpn})} \\
\midrule
\multicolumn{3}{c}{\textbf{Camera Motion Network}}\\
\midrule
& From C$_5$: 1 $\times$ 1 conv, 2048 & $\tfrac{1}{32}$ H $\times$ $\tfrac{1}{32}$ W $\times$ 2048 \\
& 3 $\times$ 3 conv, 2048, stride 2 & $\tfrac{1}{64}$ H $\times$ $\tfrac{1}{64}$ W $\times$ 2048 \\
& average pool & 1 $\times$ 2048 \\
M$_1$ & $\begin{bmatrix}\textrm{fully connected}, 1024\end{bmatrix}$ $\times$ 2 & 1 $\times$ 1024 \\
$R_t^{cam}$& From M$_1$: fully connected, 3 & 1 $\times$ 3 \\
$t_t^{cam}$& From M$_1$: fully connected, 3 & 1 $\times$ 3 \\
& From M$_1$: fully connected, 2 & 1 $\times$ 2 \\
& From C$_5$: 1 $\times$ 1 conv, 512 & $\tfrac{1}{32}$ H $\times$ $\tfrac{1}{32}$ W $\times$ 512 \\
& bilinear resize, 7 $\times$ 7 & 7 $\times$ 7 $\times$ 512 \\
& flatten & 1 $\times$ 7 $\cdot$ 7 $\cdot$ 512 \\
T$_2$ & $\begin{bmatrix}\textrm{fully connected}, 1024\end{bmatrix}$ $\times$ 2 & 1 $\times$ 1024 \\
$R_t^{cam}$& From T$_2$: fully connected, 3 & 1 $\times$ 3 \\
$t_t^{cam}$& From T$_2$: fully connected, 3 & 1 $\times$ 3 \\
& From T$_2$: fully connected, 2 & 1 $\times$ 2 \\
$o_t^{cam}$& softmax, 2 & 1 $\times$ 2 \\
\midrule
\multicolumn{3}{c}{\textbf{RoI Head \& RoI Head: Masks} (Table \ref{table:maskrcnn_resnet_fpn})} \\
\midrule
\multicolumn{3}{c}{\textbf{RoI Head: Motions}}\\
\midrule
M$_2$ & From F$_1$: $\begin{bmatrix}\textrm{fully connected}, 1024\end{bmatrix}$ $\times$ 2 & N$_{RPN}$ $\times$ 1024 \\
$\forall k: R_t^k$ & From M$_2$: fully connected, 3 & N$_{RPN}$ $\times$ 3 \\
$\forall k: t_t^k$ & From M$_2$: fully connected, 3 & N$_{RPN}$ $\times$ 3 \\
$\forall k: p_t^k$ & From M$_2$: fully connected, 3 & N$_{RPN}$ $\times$ 3 \\
& From M$_2$: fully connected, 2 & N$_{RPN}$ $\times$ 2 \\
& From M$_1$: flatten & N$_{RPN}$ $\times$ 14 $\cdot$ 14 $\cdot$ 256 \\
T$_3$ & $\begin{bmatrix}\textrm{fully connected}, 1024\end{bmatrix}$ $\times$ 2 & N$_{RPN}$ $\times$ 1024 \\
$\forall k: R_t^k$ & From T$_3$: fully connected, 3 & N$_{RPN}$ $\times$ 3 \\
$\forall k: t_t^k$ & From T$_3$: fully connected, 3 & N$_{RPN}$ $\times$ 3 \\
$\forall k: p_t^k$ & From T$_3$: fully connected, 3 & N$_{RPN}$ $\times$ 3 \\
& From T$_3$: fully connected, 2 & N$_{RPN}$ $\times$ 2 \\
$\forall k: o_t^k$ & softmax, 2 & N$_{RPN}$ $\times$ 2 \\

\bottomrule
@@ -101,12 +104,14 @@ Motion R-CNN ResNet-50-FPN architecture based on the Mask R-CNN
ResNet-50-FPN architecture (Table \ref{table:maskrcnn_resnet_fpn}).
The modifications are analogous to our Motion R-CNN ResNet-50,
but we still show the architecture for completeness.
Again, we use ReLU activations after all hidden layers and
additionally dropout with $p = 0.5$ after all fully-connected hidden layers.
}
\label{table:motionrcnn_resnet_fpn}
\end{table}
}

\paragraph{Backbone Network}
\paragraph{Motion R-CNN backbone}
Like Faster R-CNN and Mask R-CNN, we use a ResNet \cite{ResNet} variant as backbone network to compute feature maps from input imagery.

Inspired by FlowNetS \cite{FlowNet}, we make one modification to the ResNet backbone to enable image matching,
@@ -165,8 +170,9 @@ R_t^{k,z}(\gamma) =

and $\alpha, \beta, \gamma$ are the rotation angles in radians about the $x,y,z$-axis, respectively.
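For concreteness, the per-axis rotations can be composed into the full instance rotation; the composition order shown below is one common convention and is meant only as an illustration, not necessarily the order used in the thesis:
\begin{equation*}
R_t^k = R_t^{k,z}(\gamma)\, R_t^{k,y}(\beta)\, R_t^{k,x}(\alpha),
\qquad
R_t^{k,z}(\gamma) =
\begin{bmatrix}
\cos\gamma & -\sin\gamma & 0\\
\sin\gamma & \cos\gamma & 0\\
0 & 0 & 1
\end{bmatrix}.
\end{equation*}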
We then extend the Mask R-CNN head by adding a fully connected layer in parallel to the fully connected layers for
refined boxes and classes.
We then extend the Mask R-CNN head by adding a small fully-connected network for motion
prediction in addition to the fully-connected layers for
refined boxes and classes and the convolutional network for the masks.
As with refined boxes and masks, we make one separate motion prediction for each class.
Each instance motion is predicted as a set of nine scalar parameters,
$\sin(\alpha)$, $\sin(\beta)$, $\sin(\gamma)$, $t_t^k$ and $p_t^k$,
@@ -180,15 +186,46 @@ still and moving objects. As a postprocessing, for any object instance $k$ with
we set $\sin(\alpha) = \sin(\beta) = \sin(\gamma) = 0$ and $t_t^k = (0,0,0)^T$,
and thus predict an identity motion.
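The following NumPy sketch illustrates this postprocessing step; the threshold on $o_t^k$ and all names are assumptions for illustration, not the exact implementation:
\begin{verbatim}
import numpy as np

def suppress_still_motions(sines, translations, still_prob, thresh=0.5):
    """Predict an identity motion for instances classified as still.

    sines:        (N, 3) predicted sin(alpha), sin(beta), sin(gamma)
    translations: (N, 3) predicted t_t^k
    still_prob:   (N,)   softmax probability that instance k is still
    """
    still = still_prob > thresh
    sines = np.where(still[:, None], 0.0, sines)
    translations = np.where(still[:, None], 0.0, translations)
    return sines, translations
\end{verbatim}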

\paragraph{Camera motion prediction}
In addition to the object transformations, we optionally predict the camera motion $\{R_t^{cam}, t_t^{cam}\}\in \mathbf{SE}(3)$
between the two frames $I_t$ and $I_{t+1}$.
For this, we flatten the bottleneck output of the backbone and pass it through a fully connected layer.
For this, we branch off a small fully-connected network from the bottleneck output of the backbone.
We again use an Euler angle representation for $R_t^{cam}$ and
predict $\sin(\alpha)$, $\sin(\beta)$, $\sin(\gamma)$ and $t_t^{cam}$ in the same way as for the individual objects.
Again, we predict a softmax score $o_t^{cam}$ for differentiating between
a still and a moving camera.
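For illustration, a rotation matrix can be recovered from the predicted sines as in the following NumPy sketch; the restriction of the angles to $(-\pi/2, \pi/2)$ and the composition order are assumptions, mirroring the equation above:
\begin{verbatim}
import numpy as np

def rotation_from_sines(sa, sb, sg):
    """Euler rotation from predicted sines, assuming angles in (-pi/2, pi/2)."""
    a, b, g = np.arcsin(sa), np.arcsin(sb), np.arcsin(sg)
    Rx = np.array([[1, 0, 0],
                   [0, np.cos(a), -np.sin(a)],
                   [0, np.sin(a), np.cos(a)]])
    Ry = np.array([[np.cos(b), 0, np.sin(b)],
                   [0, 1, 0],
                   [-np.sin(b), 0, np.cos(b)]])
    Rz = np.array([[np.cos(g), -np.sin(g), 0],
                   [np.sin(g), np.cos(g), 0],
                   [0, 0, 1]])
    return Rz @ Ry @ Rx  # z-y-x composition order, assumed for illustration
\end{verbatim}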

\subsection{Motion R-CNN network design}
\label{ssec:design}

\paragraph{Camera motion network}
In our ResNet-50 variant (Table \ref{table:motionrcnn_resnet}), the underlying
ResNet backbone is only computed up to the $C_4$ block, as otherwise the
feature resolution for RoI extraction would be reduced too much.
In this variant, we therefore first pass the $C_4$ features through a $C_5$
block to make the camera motion networks of the two variants comparable.
Then, in both the ResNet-50 and the ResNet-50-FPN variant (Table \ref{table:motionrcnn_resnet_fpn}), we apply an additional
convolution to the $C_5$ features to reduce the number of inputs to the following
fully-connected layers.
Instead of averaging, we use bilinear resizing to bring the convolutional features
to a fixed size without losing spatial information,
flatten them, and finally apply multiple fully-connected layers to compute the
camera motion prediction.
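The following TensorFlow 1.x-style sketch illustrates this resize, flatten and predict pattern; all shapes and variable names are assumptions for illustration, not the thesis implementation:
\begin{verbatim}
import tensorflow as tf

# C5 features after the channel-reducing 1x1 conv (shapes illustrative).
c5 = tf.placeholder(tf.float32, [1, 12, 40, 512])  # (batch, H/32, W/32, C)

# Bilinear resizing to a fixed 7x7 grid keeps coarse spatial layout,
# unlike global average pooling, which collapses it to 1x1.
resized = tf.image.resize_images(c5, [7, 7])       # bilinear by default
flat = tf.reshape(resized, [1, 7 * 7 * 512])

# Two fully-connected hidden layers with ReLU and dropout, as in the table.
fc1 = tf.layers.dropout(tf.layers.dense(flat, 1024, tf.nn.relu), rate=0.5)
trunk = tf.layers.dropout(tf.layers.dense(fc1, 1024, tf.nn.relu), rate=0.5)
# trunk feeds the fully-connected camera motion output layers
\end{verbatim}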

\paragraph{RoI motion head network}
In both of our network variants
(Tables \ref{table:motionrcnn_resnet} and \ref{table:motionrcnn_resnet_fpn}),
we compute the fully-connected network for motion prediction from the
convolutional mask features, branching off right before the mask upsampling
deconvolution. The intuition behind this is that the final mask features contain
high-resolution spatial information about which positions belong to the object and
which belong to the background. Thus, we allow the motion estimation network to
make use of this information and, ideally, to integrate the motion (image matching) cues
localized within the object, but not those belonging to the background,
into the final object motion estimate.

\subsection{Supervision}
\label{ssec:supervision}

@@ -22,7 +22,7 @@ Deep convolutional neural network (CNN) architectures
became widely popular through numerous successes in classification and recognition tasks.
The general structure of a CNN consists of a convolutional encoder, which
learns a spatially compressed, wide (in the number of channels) representation of the input image,
and a fully connected prediction network on top of the encoder.
and a fully-connected prediction network on top of the encoder.

The compressed representations learned by CNNs of these categories do not, however, allow
for prediction of high-resolution output, as spatial detail is lost through sequential applications
@@ -190,6 +190,7 @@ Operations enclosed in a []$_b$ block make up a single ResNet \enquote{bottlenec
block (see Figure \ref{figure:bottleneck}). If the block is denoted as []$_b/2$,
the first conv operation in the block has a stride of 2. Note that the stride
is only applied to the first block, but not to repeated blocks.
Batch normalization \cite{BN} is used after every convolution.
}
\label{table:resnet}
\end{longtable}
@@ -317,7 +318,7 @@ classes& softmax, N$_{cls}$ & N$_{RoI}$ $\times$ N$_{cls}$ \\
\midrule
\multicolumn{3}{c}{\textbf{RoI Head: Masks}}\\
\midrule
& From R$_1$: 2 $\times$ 2 deconv, 256, stride 2 & N$_{RoI}$ $\times$ 14 $\times$ 14 $\times$ 256 \\
M$_0$ & From R$_1$: 2 $\times$ 2 deconv, 256, stride 2 & N$_{RoI}$ $\times$ 14 $\times$ 14 $\times$ 256 \\
& 1 $\times$ 1 conv, N$_{cls}$ & N$_{RoI}$ $\times$ 14 $\times$ 14 $\times$ N$_{cls}$ \\
masks & sigmoid, N$_{cls}$ & N$_{RoI}$ $\times$ 14 $\times$ 14 $\times$ N$_{cls}$ \\

@@ -410,7 +411,7 @@ classes& softmax, N$_{cls}$ & N$_{RoI}$ $\times$ N$_{cls}$ \\
\midrule
\multicolumn{3}{c}{\textbf{RoI Head: Masks}}\\
\midrule
& From R$_2$: $\begin{bmatrix}\textrm{3 $\times$ 3 conv} \end{bmatrix}$ $\times$ 4, 256 & N$_{RoI}$ $\times$ 14 $\times$ 14 $\times$ 256 \\
M$_1$ & From R$_2$: $\begin{bmatrix}\textrm{3 $\times$ 3 conv} \end{bmatrix}$ $\times$ 4, 256 & N$_{RoI}$ $\times$ 14 $\times$ 14 $\times$ 256 \\
& 2 $\times$ 2 deconv, 256, stride 2 & N$_{RoI}$ $\times$ 28 $\times$ 28 $\times$ 256 \\
& 1 $\times$ 1 conv, N$_{cls}$ & N$_{RoI}$ $\times$ 28 $\times$ 28 $\times$ N$_{cls}$ \\
masks & sigmoid, N$_{cls}$ & N$_{RoI}$ $\times$ 28 $\times$ 28 $\times$ N$_{cls}$ \\

bib.bib: 12 changed lines
@@ -237,3 +237,15 @@
title = {End-to-End Learning of Geometry and Context for Deep Stereo Regression},
booktitle = {CVPR},
year = {2017}}

@inproceedings{BN,
author = {Sergey Ioffe and Christian Szegedy},
title = {Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift},
booktitle = {ICML},
year = {2015}}

@inproceedings{He,
author = {Kaiming He and Xiangyu Zhang and Shaoqing Ren and Jian Sun},
title = {Delving Deep into Rectifiers: Surpassing Human-Level Performance on ImageNet Classification},
booktitle = {ICCV},
year = {2015}}
@@ -41,9 +41,6 @@ to be used as validation set. From the remaining 2026 examples,
we remove a small number of examples without object instances and use the resulting
data as training set.

\paragraph{KITTI 2015}
\todo{add this if adding KITTI 2015 evaluations}

\paragraph{Motion ground truth from 3D poses and camera extrinsics}
We will now describe how we use the ground truth poses and camera matrices from Virtual KITTI to
compute instance and camera motion ground truth.
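As a rough sketch of the kind of computation involved, the relative rigid motion between two $4 \times 4$ homogeneous poses can be obtained as follows; the pose convention and all names are assumptions for illustration, not the exact Virtual KITTI processing:
\begin{verbatim}
import numpy as np

def relative_motion(pose_t, pose_t1):
    """Relative rigid motion between two 4x4 poses at frames t and t+1.

    Assumes both poses map object (or world) coordinates to camera
    coordinates. Returns the 3x3 rotation and translation vector that
    map frame-t camera coordinates to frame-(t+1) camera coordinates.
    """
    motion = pose_t1 @ np.linalg.inv(pose_t)
    return motion[:3, :3], motion[:3, 3]
\end{verbatim}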

@@ -161,7 +158,20 @@ As learning rate we use $0.25 \cdot 10^{-2}$ for the
first 144K iterations and $0.25 \cdot 10^{-3}$ for all remaining iterations.

\paragraph{R-CNN training parameters}
\todo{add this}
For training the RPN and RoI heads and during inference,
we use exactly the same number of proposals and RoIs as Mask R-CNN in
the ResNet-50 and ResNet-50-FPN variants, respectively.

\paragraph{Initialization}
Following the pre-existing TensorFlow implementation of Faster R-CNN,
we initialize all hidden layers with He initialization \cite{He}.
For the fully-connected camera and instance motion output layers,
we use a truncated normal initializer with a standard
deviation of $0.0001$ and zero mean, truncated at two standard deviations.
Note that larger initial weights prevented the
angle sine estimates from properly converging to the very small values they
are in general expected to output.
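A minimal NumPy sketch of such a truncated normal initializer (rejection-style resampling; all names are illustrative assumptions):
\begin{verbatim}
import numpy as np

def truncated_normal(shape, stddev=1e-4, seed=0):
    """Sample N(0, stddev^2), redrawing values beyond 2 standard deviations."""
    rng = np.random.default_rng(seed)
    w = rng.normal(0.0, stddev, size=shape)
    out_of_range = np.abs(w) > 2 * stddev
    while out_of_range.any():
        w[out_of_range] = rng.normal(0.0, stddev, size=out_of_range.sum())
        out_of_range = np.abs(w) > 2 * stddev
    return w

weights = truncated_normal((1024, 3))  # e.g. a 1024 -> 3 motion output layer
\end{verbatim}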

\subsection{Experiments on Virtual KITTI}
\label{ssec:vkitti}
@@ -221,7 +231,3 @@ Figure \ref{figure:vkitti} visualizes instance segmentation and optical flow
results on the Virtual KITTI validation set.
Table \ref{table:vkitti} compares the performance of different network variants on the Virtual KITTI validation
set.

\subsection{Evaluation on KITTI 2015}
\todo{if there is enough time,
add evaluation (table + visualizations) of instance segmentation and composed optical flow on RGB-D frames from KITTI 2015 train}

@@ -40,7 +40,7 @@ the location and 3D motion of each object instance relative to the camera
Recently, SfM-Net \cite{SfmNet} introduced an end-to-end deep learning approach for predicting depth
and dense optical flow in monocular image sequences based on estimating the 3D motion of individual objects and the camera.
SfM-Net predicts a batch of binary full-image masks specifying the object memberships of individual pixels with a standard encoder-decoder
network for pixel-wise prediction. A fully connected network branching off the encoder predicts a 3D motion for each object.
network for pixel-wise prediction. A fully-connected network branching off the encoder predicts a 3D motion for each object.
However, due to the fixed number of object masks, the system can only predict a small number of motions and
often fails to properly segment the pixels into the correct masks or assigns background pixels to object motions (Figure \ref{figure:sfmnet_kitti}).
\begin{figure}[t]
@@ -95,8 +95,8 @@ manageable pieces.
\includegraphics[width=\textwidth]{figures/net_intro}
\caption{
Overview of our network based on Mask R-CNN. For each RoI, we predict the instance motion
in parallel to the class, bounding box and mask. We branch off a fully connected
layer for predicting the camera motion from the bottleneck.
in parallel to the class, bounding box and mask. We additionally branch off a
small network for predicting the camera motion from the bottleneck.
}
\label{figure:net_intro}
\end{figure}