WIP
This commit is contained in: parent c1efb75b1a, commit f8eab9559a

approach.tex
@ -8,7 +8,99 @@ For this, we extend Mask R-CNN in two straightforward ways.
First, we modify the backbone network and provide two frames to the R-CNN system
in order to enable image matching between the consecutive frames.
Second, we extend the Mask R-CNN RoI head to predict a 3D motion for each
region proposal. Table \ref{table:motionrcnn_resnet} shows the modified network.

{
\begin{table}[h]
\centering
\begin{tabular}{llr}
\toprule
\textbf{Layer ID} & \textbf{Layer Operations} & \textbf{Output Dimensions} \\
\midrule\midrule
 & input image & H $\times$ W $\times$ C \\
\midrule
C$_4$ & ResNet-50 \{up to C$_4$\} (Table \ref{table:resnet}) & $\tfrac{1}{16}$ H $\times$ $\tfrac{1}{16}$ W $\times$ 1024 \\
\midrule
\multicolumn{3}{c}{\textbf{Region Proposal Network (RPN)} (Table \ref{table:maskrcnn_resnet})}\\
\midrule
\multicolumn{3}{c}{\textbf{Camera Motion Network}}\\
\midrule
 & From C$_4$: ResNet-50 \{C$_5$\} (Table \ref{table:resnet}) & $\tfrac{1}{32}$ H $\times$ $\tfrac{1}{32}$ W $\times$ 2048 \\
 & 1 $\times$ 1 conv, 1024 & $\tfrac{1}{32}$ H $\times$ $\tfrac{1}{32}$ W $\times$ 1024 \\
 & 3 $\times$ 3 conv, 1024, stride 2 & $\tfrac{1}{64}$ H $\times$ $\tfrac{1}{64}$ W $\times$ 1024 \\
 & average pool & 1 $\times$ 1024 \\
M$_1$ & $\begin{bmatrix}\textrm{fully connected}, 1024\end{bmatrix}$ $\times$ 2 & 1 $\times$ 1024 \\
$R_t^{cam}$& From M$_1$: fully connected, 3 & 1 $\times$ 3 \\
$t_t^{cam}$& From M$_1$: fully connected, 3 & 1 $\times$ 3 \\
$o_t^{cam}$& From M$_1$: fully connected, 2 & 1 $\times$ 2 \\
\midrule
\multicolumn{3}{c}{\textbf{RoI Head \& RoI Head: Masks} (Table \ref{table:maskrcnn_resnet})}\\
\midrule
\multicolumn{3}{c}{\textbf{RoI Head: Motions}}\\
\midrule
M$_2$ & From ave: $\begin{bmatrix}\textrm{fully connected}, 1024\end{bmatrix}$ $\times$ 2 & N$_{RPN}$ $\times$ 1024 \\
$\forall k: R_t^k$ & From M$_2$: fully connected, 3 & N$_{RPN}$ $\times$ 3 \\
$\forall k: t_t^k$ & From M$_2$: fully connected, 3 & N$_{RPN}$ $\times$ 3 \\
$\forall k: p_t^k$ & From M$_2$: fully connected, 3 & N$_{RPN}$ $\times$ 3 \\
$\forall k: o_t^k$ & From M$_2$: fully connected, 2 & N$_{RPN}$ $\times$ 2 \\
\bottomrule
\end{tabular}
\caption{
Motion R-CNN ResNet-50 architecture based on the Mask R-CNN
ResNet-50 architecture (Table \ref{table:maskrcnn_resnet}).
}
\label{table:motionrcnn_resnet}
\end{table}
}

{
\begin{table}[h]
\centering
\begin{tabular}{llr}
\toprule
\textbf{Layer ID} & \textbf{Layer Operations} & \textbf{Output Dimensions} \\
\midrule\midrule
 & input image & H $\times$ W $\times$ C \\
\midrule
C$_5$ & ResNet-50 (Table \ref{table:resnet}) & $\tfrac{1}{32}$ H $\times$ $\tfrac{1}{32}$ W $\times$ 2048 \\
\midrule
\multicolumn{3}{c}{\textbf{RPN \& FPN} (Table \ref{table:maskrcnn_resnet_fpn})} \\
\midrule
\multicolumn{3}{c}{\textbf{Camera Motion Network}}\\
\midrule
 & From C$_5$: 1 $\times$ 1 conv, 1024 & $\tfrac{1}{32}$ H $\times$ $\tfrac{1}{32}$ W $\times$ 1024 \\
 & 3 $\times$ 3 conv, 1024, stride 2 & $\tfrac{1}{64}$ H $\times$ $\tfrac{1}{64}$ W $\times$ 1024 \\
 & average pool & 1 $\times$ 1024 \\
M$_1$ & $\begin{bmatrix}\textrm{fully connected}, 1024\end{bmatrix}$ $\times$ 2 & 1 $\times$ 1024 \\
$R_t^{cam}$& From M$_1$: fully connected, 3 & 1 $\times$ 3 \\
$t_t^{cam}$& From M$_1$: fully connected, 3 & 1 $\times$ 3 \\
$o_t^{cam}$& From M$_1$: fully connected, 2 & 1 $\times$ 2 \\
\midrule
\multicolumn{3}{c}{\textbf{RoI Head \& RoI Head: Masks} (Table \ref{table:maskrcnn_resnet_fpn})} \\
\midrule
\multicolumn{3}{c}{\textbf{RoI Head: Motions}}\\
\midrule
M$_2$ & From F$_1$: $\begin{bmatrix}\textrm{fully connected}, 1024\end{bmatrix}$ $\times$ 2 & N$_{RPN}$ $\times$ 1024 \\
$\forall k: R_t^k$ & From M$_2$: fully connected, 3 & N$_{RPN}$ $\times$ 3 \\
$\forall k: t_t^k$ & From M$_2$: fully connected, 3 & N$_{RPN}$ $\times$ 3 \\
$\forall k: p_t^k$ & From M$_2$: fully connected, 3 & N$_{RPN}$ $\times$ 3 \\
$\forall k: o_t^k$ & From M$_2$: fully connected, 2 & N$_{RPN}$ $\times$ 2 \\
\bottomrule
\end{tabular}
\caption{
Motion R-CNN ResNet-50-FPN architecture based on the Mask R-CNN
ResNet-50-FPN architecture (Table \ref{table:maskrcnn_resnet_fpn}).
The modifications are analogous to those in our Motion R-CNN ResNet-50,
but we show the architecture for completeness.
}
\label{table:motionrcnn_resnet_fpn}
\end{table}
}

\paragraph{Backbone Network}
Like Faster R-CNN and Mask R-CNN, we use a ResNet \cite{ResNet} variant as backbone network to compute feature maps from input imagery.

@ -70,7 +162,7 @@ R_t^{k,z}(\gamma) =
and $\alpha, \beta, \gamma$ are the rotation angles in radians about the $x,y,z$-axis, respectively.

We then extend the Mask R-CNN head by adding a fully connected layer in parallel to the fully connected layers for
refined boxes and classes.
As with the refined boxes and masks, we make one separate motion prediction for each class.
Each instance motion is predicted as a set of nine scalar parameters,
$\sin(\alpha)$, $\sin(\beta)$, $\sin(\gamma)$, $t_t^k$ and $p_t^k$,
@ -81,7 +173,6 @@ which is in general a safe assumption for image sequences from videos.
All predictions are made in camera space, and translation and pivot predictions are in meters.
We additionally predict softmax scores $o_t^k$ for classifying each object as
still or moving.
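To make the parametrization concrete, the following NumPy sketch shows one way to recover a rotation matrix
from the three predicted sines; it assumes the angles lie in $(-\tfrac{\pi}{2}, \tfrac{\pi}{2})$ so that the
cosines are non-negative, and the composition order shown is only one possible convention, not necessarily
the one used in our implementation.
\begin{verbatim}
import numpy as np

def rotation_from_sines(sin_a, sin_b, sin_c):
    # Valid because the angles are assumed to lie in (-pi/2, pi/2),
    # so all cosines are non-negative.
    cos_a = np.sqrt(1.0 - sin_a ** 2)
    cos_b = np.sqrt(1.0 - sin_b ** 2)
    cos_c = np.sqrt(1.0 - sin_c ** 2)
    R_x = np.array([[1, 0, 0], [0, cos_a, -sin_a], [0, sin_a, cos_a]])
    R_y = np.array([[cos_b, 0, sin_b], [0, 1, 0], [-sin_b, 0, cos_b]])
    R_z = np.array([[cos_c, -sin_c, 0], [sin_c, cos_c, 0], [0, 0, 1]])
    # Illustrative composition order (rotation about x, then y, then z).
    return R_z @ R_y @ R_x
\end{verbatim}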
\todo{figure of head}

\paragraph{Camera motion prediction}
In addition to the object transformations, we optionally predict the camera motion $\{R_t^{cam}, t_t^{cam}\}\in \mathbf{SE}(3)$
@ -92,52 +183,6 @@ predict $\sin(\alpha)$, $\sin(\beta)$, $\sin(\gamma)$ and $t_t^{cam}$ in the sam
Again, we predict a softmax score $o_t^{cam}$ for differentiating between
a still and a moving camera.
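For illustration only, one plausible way to apply a predicted instance motion $\{R_t^k, t_t^k\}$ with pivot $p_t^k$
together with the camera motion to a 3D point in camera space is sketched below, following an SfM-Net-style
composition; see Section \ref{ssec:supervision} for how these predictions are supervised.
\begin{verbatim}
import numpy as np

def apply_motions(X, R_k, t_k, p_k, R_cam, t_cam):
    # Rigid object motion about the object pivot, followed by the
    # rigid camera motion. Illustrative composition only.
    X_obj = R_k @ (X - p_k) + p_k + t_k
    return R_cam @ X_obj + t_cam
\end{verbatim}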
\subsection{Supervision}
\label{ssec:supervision}

background.tex
@ -76,15 +76,16 @@ Figure \ref{figure:bottleneck}
shows the fundamental building block of ResNet-50.

{
%\begin{table}[h]
%\centering
\begin{longtable}{llr}
\toprule
\textbf{Layer ID} & \textbf{Layer Operations} & \textbf{Output Dimensions} \\
\midrule\midrule
 & input image & H $\times$ W $\times$ C \\
\midrule
\multicolumn{3}{c}{\textbf{ResNet-50}}\\
\midrule
C$_1$ & 7 $\times$ 7 conv, 64, stride 2 & $\tfrac{1}{2}$ H $\times$ $\tfrac{1}{2}$ W $\times$ 64 \\
 & 3 $\times$ 3 max pool, stride 2 & $\tfrac{1}{4}$ H $\times$ $\tfrac{1}{4}$ W $\times$ 64 \\
@ -96,7 +97,7 @@ $\begin{bmatrix}
1 \times 1, 256 \\
\end{bmatrix}_b$ $\times$ 3
& $\tfrac{1}{4}$ H $\times$ $\tfrac{1}{4}$ W $\times$ 256 \\
\midrule
C$_3$ &
$\begin{bmatrix}
1 \times 1, 128 \\
@ -104,7 +105,7 @@ $\begin{bmatrix}
1 \times 1, 512 \\
\end{bmatrix}_{b/2}$ $\times$ 4
& $\tfrac{1}{8}$ H $\times$ $\tfrac{1}{8}$ W $\times$ 512 \\
\midrule
C$_4$ &
$\begin{bmatrix}
1 \times 1, 256 \\
@ -112,7 +113,7 @@ $\begin{bmatrix}
1 \times 1, 1024 \\
\end{bmatrix}_{b/2}$ $\times$ 6
& $\tfrac{1}{16}$ H $\times$ $\tfrac{1}{16}$ W $\times$ 1024 \\
\midrule
C$_5$ &
$\begin{bmatrix}
1 \times 1, 512 \\
@ -122,24 +123,26 @@ $\begin{bmatrix}
& $\tfrac{1}{32}$ H $\times$ $\tfrac{1}{32}$ W $\times$ 2048 \\
\bottomrule
\caption{
ResNet-50 architecture (adapted from \cite{ResNet}).
Operations enclosed in a []$_b$ block make up a single ResNet \enquote{bottleneck}
block (see Figure \ref{figure:bottleneck}). If the block is denoted as []$_{b/2}$,
the first conv operation in the block has a stride of 2. Note that the stride
is only applied to the first block, but not to repeated blocks.
}
\label{table:resnet}
\end{longtable}
%\end{table}
}

\begin{figure}[t]
\centering
\includegraphics[width=0.3\textwidth]{figures/bottleneck}
\caption{
ResNet \cite{ResNet} \enquote{bottleneck} convolutional block introduced to reduce computational
complexity in deeper network variants, shown here with 256 input and output channels.
}
\label{figure:bottleneck}
@ -193,21 +196,6 @@ The \emph{second stage} corresponds to the original Fast R-CNN head network, per
and bounding box refinement for each region proposal. % TODO verify that it isn't modified
As in Fast R-CNN, RoI pooling is used to crop one fixed-size feature map for each of the region proposals.

\paragraph{Mask R-CNN}
Faster R-CNN and the earlier systems detect and classify objects at bounding box granularity.
@ -218,54 +206,137 @@ Mask R-CNN \cite{MaskRCNN} extends the Faster R-CNN system to instance segmentat
fixed-resolution instance masks within the bounding boxes of each detected object.
This is done by simply extending the Faster R-CNN head with multiple convolutions, which
compute a pixel-precise mask for each instance.
In addition to extending the original Faster R-CNN head, Mask R-CNN also introduced a network
variant based on Feature Pyramid Networks \cite{FPN}.
Figure \ref{} compares the two Mask R-CNN head variants.
The basic Mask R-CNN ResNet-50 architecture is shown in Table \ref{table:maskrcnn_resnet}.
\todo{RoI Align}

{
%\begin{table}[t]
%\centering
\begin{longtable}{llr}
\toprule
\textbf{Layer ID} & \textbf{Layer Operations} & \textbf{Output Dimensions} \\
\midrule\midrule
 & input image & H $\times$ W $\times$ C \\
\midrule
C$_4$ & ResNet-50 \{up to C$_4$\} (Table \ref{table:resnet}) & $\tfrac{1}{16}$ H $\times$ $\tfrac{1}{16}$ W $\times$ 1024 \\
\midrule
\multicolumn{3}{c}{\textbf{Region Proposal Network (RPN)}}\\
\midrule
 & From C$_4$: 1 $\times$ 1 conv, 512 & $\tfrac{1}{16}$ H $\times$ $\tfrac{1}{16}$ W $\times$ 512 \\
 & 1 $\times$ 1 conv, 6 & $\tfrac{1}{16}$ H $\times$ $\tfrac{1}{16}$ W $\times$ 6 \\
 & flatten & A $\times$ 6 \\
 & decode bounding boxes (Eq. \ref{eq:pred_bounding_box}) & A $\times$ 6 \\
ROI$_{\mathrm{RPN}}$ & sample bounding boxes \& scores (Listing \ref{}) & N$_{RPN}$ $\times$ 6 \\
\midrule
\multicolumn{3}{c}{\textbf{RoI Head}}\\
\midrule
 & From C$_4$ with ROI$_{\mathrm{RPN}}$: RoI pooling (\ref{}) & N$_{RPN}$ $\times$ 7 $\times$ 7 $\times$ 1024 \\
R$_1$& ResNet-50 \{C$_5$ without stride\} (Table \ref{table:resnet}) & N$_{RPN}$ $\times$ 7 $\times$ 7 $\times$ 2048 \\
ave & average pool & N$_{RPN}$ $\times$ 2048 \\
boxes& From ave: fully connected, 4 & N$_{RPN}$ $\times$ 4 \\
logits& From ave: fully connected, N$_{cls}$ & N$_{RPN}$ $\times$ N$_{cls}$ \\
\midrule
\multicolumn{3}{c}{\textbf{RoI Head: Masks}}\\
\midrule
 & From R$_1$: 2 $\times$ 2 deconv, 256, stride 2 & N$_{RPN}$ $\times$ 14 $\times$ 14 $\times$ 256 \\
masks & 1 $\times$ 1 conv, N$_{cls}$ & N$_{RPN}$ $\times$ 14 $\times$ 14 $\times$ N$_{cls}$ \\
\bottomrule
\caption{
Mask R-CNN \cite{MaskRCNN} ResNet-50 \cite{ResNet} architecture.
Note that this is equivalent to the Faster R-CNN architecture if the mask
head is left out.
}
\label{table:maskrcnn_resnet}
\end{longtable}
%\end{table}
}

\paragraph{Feature Pyramid Networks}
In Faster R-CNN, a single feature map is used as the source of all RoIs, independent
of the size of the bounding box of the RoI.
However, for small objects, the C$_4$ (see Table \ref{table:maskrcnn_resnet}) features
might have lost too much spatial information to properly predict the exact bounding
box and a high-resolution mask. Likewise, for very large objects, the fixed-size
RoI window might be too small to cover the region of the feature map containing
information for this object.
As a solution to this, the Feature Pyramid Network (FPN) \cite{FPN} enables features
of an appropriate scale to be used, depending on the size of the bounding box.
For this, a pyramid of feature maps is created on top of the ResNet \cite{ResNet}
encoder. \todo{figure and more details}
Now, during RoI pooling,
\todo{show formula}.
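For reference, \cite{FPN} assigns an RoI of width $w$ and height $h$ to the pyramid level
\begin{equation*}
k = \left\lfloor k_0 + \log_2\left(\sqrt{w h} / 224\right) \right\rfloor,
\end{equation*}
where $k_0$ is the target level for an RoI of size $224 \times 224$ (set to $4$ in \cite{FPN})
and $k$ is clamped to the range of available pyramid levels.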
The Mask R-CNN ResNet-50-FPN variant is shown in Table \ref{table:maskrcnn_resnet_fpn}.

{
%\begin{table}[t]
%\centering
\begin{longtable}{llr}
\toprule
\textbf{Layer ID} & \textbf{Layer Operations} & \textbf{Output Dimensions} \\
\midrule\midrule
 & input image & H $\times$ W $\times$ C \\
\midrule
C$_5$ & ResNet-50 (Table \ref{table:resnet}) & $\tfrac{1}{32}$ H $\times$ $\tfrac{1}{32}$ W $\times$ 2048 \\
\midrule
\multicolumn{3}{c}{\textbf{Feature Pyramid Network (FPN)}}\\
\midrule
P$_5$ & From C$_5$: 1 $\times$ 1 conv, 256 & $\tfrac{1}{32}$ H $\times$ $\tfrac{1}{32}$ W $\times$ 256 \\
P$_4$ & $\begin{bmatrix}\textrm{skip from C$_4$}\end{bmatrix}_p$ & $\tfrac{1}{16}$ H $\times$ $\tfrac{1}{16}$ W $\times$ 256 \\
P$_3$ & $\begin{bmatrix}\textrm{skip from C$_3$}\end{bmatrix}_p$ & $\tfrac{1}{8}$ H $\times$ $\tfrac{1}{8}$ W $\times$ 256 \\
P$_2$ & $\begin{bmatrix}\textrm{skip from C$_2$}\end{bmatrix}_p$ & $\tfrac{1}{4}$ H $\times$ $\tfrac{1}{4}$ W $\times$ 256 \\
P$_6$ & From P$_5$: 2 $\times$ 2 subsample, 256 & $\tfrac{1}{64}$ H $\times$ $\tfrac{1}{64}$ W $\times$ 256 \\
\midrule
\multicolumn{3}{c}{\textbf{Region Proposal Network (RPN)}}\\
\midrule
\multicolumn{3}{c}{$\forall i \in \{2...6\}$}\\
 & From P$_i$: 1 $\times$ 1 conv, 512 & $\tfrac{1}{2^i}$ H $\times$ $\tfrac{1}{2^i}$ W $\times$ 512 \\
 & 1 $\times$ 1 conv, 6 & $\tfrac{1}{2^i}$ H $\times$ $\tfrac{1}{2^i}$ W $\times$ 6 \\
RPN$_i$& flatten & A$_i$ $\times$ 6 \\
\midrule
 & From \{RPN$_2$ ... RPN$_6$\}: concatenate & A $\times$ 6 \\
 & decode bounding boxes (Eq. \ref{eq:pred_bounding_box}) & A $\times$ 6 \\
ROI$_{\mathrm{RPN}}$ & sample bounding boxes \& scores (Listing \ref{}) & N$_{RPN}$ $\times$ 6 \\
\midrule
\multicolumn{3}{c}{\textbf{RoI Head}}\\
\midrule
R$_2$ & From \{P$_2$ ... P$_6$\} with ROI$_{\mathrm{RPN}}$: FPN RoI crop & N$_{RPN}$ $\times$ 14 $\times$ 14 $\times$ 256 \\
 & 2 $\times$ 2 max pool & N$_{RPN}$ $\times$ 7 $\times$ 7 $\times$ 256 \\
F$_1$ & $\begin{bmatrix}\textrm{fully connected}, 1024\end{bmatrix}$ $\times$ 2 & N$_{RPN}$ $\times$ 1024 \\
boxes& From F$_1$: fully connected, 4 & N$_{RPN}$ $\times$ 4 \\
logits& From F$_1$: fully connected, N$_{cls}$ & N$_{RPN}$ $\times$ N$_{cls}$ \\
\midrule
\multicolumn{3}{c}{\textbf{RoI Head: Masks}}\\
\midrule
 & From R$_2$: $\begin{bmatrix}\textrm{3 $\times$ 3 conv, 256}\end{bmatrix}$ $\times$ 4 & N$_{RPN}$ $\times$ 14 $\times$ 14 $\times$ 256 \\
 & 2 $\times$ 2 deconv, 256, stride 2 & N$_{RPN}$ $\times$ 28 $\times$ 28 $\times$ 256 \\
masks & 1 $\times$ 1 conv, N$_{cls}$ & N$_{RPN}$ $\times$ 28 $\times$ 28 $\times$ N$_{cls}$ \\
\bottomrule
\caption{
Mask R-CNN \cite{MaskRCNN} ResNet-50-FPN \cite{ResNet} architecture.
Operations enclosed in a []$_p$ block make up a single FPN
block (see Figure \ref{figure:fpn_block}).
}
\label{table:maskrcnn_resnet_fpn}
\end{longtable}
%\end{table}
}

\begin{figure}[t]
\centering
\includegraphics[width=0.3\textwidth]{figures/fpn}
\caption{
FPN block from \cite{FPN}.
Lower resolution features coming from the bottleneck are bilinearly upsampled
and added to the higher resolution skip connections from the encoder.
}
\label{figure:fpn_block}
\end{figure}
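As a rough sketch of a single merge step, assuming the shapes from Table \ref{table:maskrcnn_resnet_fpn}
(the $3 \times 3$ smoothing convolution that \cite{FPN} applies after the sum is omitted for brevity,
and the function and variable names are purely illustrative):
\begin{verbatim}
import numpy as np
from scipy.ndimage import zoom

def fpn_block(p_coarse, c_skip, w_1x1):
    # p_coarse: (H/2, W/2, 256) coarser pyramid level
    # c_skip:   (H, W, C_i) encoder feature map, w_1x1: (C_i, 256)
    skip = c_skip @ w_1x1                    # 1x1 conv = per-pixel matmul
    up = zoom(p_coarse, (2, 2, 1), order=1)  # x2 bilinear upsampling
    return up + skip                         # merged (H, W, 256) map
\end{verbatim}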
\subsection{Training Mask R-CNN}
\label{ssec:rcnn_techn}
\paragraph{Bounding box regression}
All bounding boxes predicted by the RoI head or RPN are estimated as offsets
with respect to a reference bounding box. In the case of the RPN,
@ -274,10 +345,10 @@ predicted relative to the RPN output bounding boxes.
Let $(x, y, w, h)$ be the top-left coordinates, width and height of the bounding box
to be predicted. Likewise, let $(x^*, y^*, w^*, h^*)$ be the ground truth bounding
box and let $(x_r, y_r, w_r, h_r)$ be the reference bounding box.
We then define the ground truth \emph{box encoding} $b_e^*$ as
\begin{equation}
b_e^* = (b_x^*, b_y^*, b_w^*, b_h^*),
\end{equation}
where
\begin{equation*}
b_x^* = \frac{x^* - x_r}{w_r},
@ -294,10 +365,10 @@ b_h^* = \log \left( \frac{h^*}{h_r} \right),
which represents the regression target for the bounding box refinement
outputs of the network.

In the same way, we define the predicted box encoding $b_e$ as
\begin{equation}
b_e = (b_x, b_y, b_w, b_h),
\end{equation}
where
\begin{equation*}
b_x = \frac{x - x_r}{w_r},
@ -312,8 +383,13 @@ b_w = \log \left( \frac{w}{w_r} \right)
b_h = \log \left( \frac{h}{h_r} \right).
\end{equation*}

At test time, to get from a predicted box encoding $b_e$ to the predicted bounding box $b$,
we invert the definitions above,
\begin{equation}
b = (x, y, w, h),
\label{eq:pred_bounding_box}
\end{equation}
where
\begin{equation*}
x = b_x \cdot w_r + x_r,
\end{equation*}
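For concreteness, a small NumPy sketch of the encoding and its inverse; the remaining decode
equations follow by symmetry from the definitions above, and the function names are purely illustrative.
\begin{verbatim}
import numpy as np

def encode_box(box, ref):
    # Box encoding b_e relative to a reference box; boxes are (x, y, w, h).
    x, y, w, h = box
    xr, yr, wr, hr = ref
    return np.array([(x - xr) / wr, (y - yr) / hr,
                     np.log(w / wr), np.log(h / hr)])

def decode_box(enc, ref):
    # Inverse of encode_box, used at test time to recover (x, y, w, h).
    bx, by, bw, bh = enc
    xr, yr, wr, hr = ref
    return np.array([bx * wr + xr, by * hr + yr,
                     np.exp(bw) * wr, np.exp(bh) * hr])
\end{verbatim}
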
bib.bib
@ -230,3 +230,10 @@
  title = {Backpropagation applied to handwritten zip code recognition},
  booktitle = {Neural Computation},
  year = {1989}}

@inproceedings{GCNet,
  author = {Alex Kendall and Hayk Martirosyan and Saumitro Dasgupta and Peter Henry and
            Ryan Kennedy and Abraham Bachrach and Adam Bry},
  title = {End-to-End Learning of Geometry and Context for Deep Stereo Regression},
  booktitle = {ICCV},
  year = {2017}}

@ -24,13 +24,49 @@ However, in many applications settings, we are not provided with any depth infor
In most cases, we want to work with raw RGB sequences from one or multiple simple cameras,
from which no depth data is available.
To do so, we could integrate depth prediction into our network by branching off a
depth network from the backbone in parallel to the RPN (Table \ref{table:motionrcnn_resnet_fpn_depth}).
Alternatively, we could add a specialized network for end-to-end depth regression
in parallel to the region-based network, e.g. \cite{GCNet}.
Although single-frame monocular depth prediction with deep networks has already been demonstrated with some success,
our two-frame input should allow the network to make use of epipolar
geometry for making a more reliable depth estimate, at least when the camera
is moving.
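As a sketch of what such a depth branch could look like, using a Keras-style API for illustration
(the layer widths follow Table \ref{table:motionrcnn_resnet_fpn_depth}; everything else is an
assumption rather than a fixed design):
\begin{verbatim}
import tensorflow as tf

def depth_head(p2):
    # Illustrative depth branch on top of the FPN P2 feature map
    # (quarter resolution, 256 channels); not our exact implementation.
    x = tf.keras.layers.Conv2D(256, 3, padding="same", activation="relu")(p2)
    x = tf.keras.layers.Conv2D(1, 1, padding="same")(x)  # per-pixel depth
    # upsample from 1/4 resolution back to the input resolution
    return tf.keras.layers.UpSampling2D(size=4, interpolation="bilinear")(x)
\end{verbatim}
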
{
\begin{table}[h]
\centering
\begin{tabular}{llr}
\toprule
\textbf{Layer ID} & \textbf{Layer Operations} & \textbf{Output Dimensions} \\
\midrule\midrule
 & input image & H $\times$ W $\times$ C \\
\midrule
C$_5$ & ResNet-50 (Table \ref{table:resnet}) & $\tfrac{1}{32}$ H $\times$ $\tfrac{1}{32}$ W $\times$ 2048 \\
\midrule
\multicolumn{3}{c}{\textbf{RPN \& FPN} (Table \ref{table:maskrcnn_resnet_fpn})} \\
\midrule
\multicolumn{3}{c}{\textbf{Depth Network}}\\
\midrule
 & From P$_2$: 3 $\times$ 3 conv, 256 & $\tfrac{1}{4}$ H $\times$ $\tfrac{1}{4}$ W $\times$ 256 \\
 & 1 $\times$ 1 conv, 1 & $\tfrac{1}{4}$ H $\times$ $\tfrac{1}{4}$ W $\times$ 1 \\
 & $\times$ 4 bilinear upsample & H $\times$ W $\times$ 1 \\
\midrule
\multicolumn{3}{c}{\textbf{Camera Motion Network} (Table \ref{table:motionrcnn_resnet_fpn})}\\
\midrule
\multicolumn{3}{c}{\textbf{RoI Head \& RoI Head: Masks} (Table \ref{table:maskrcnn_resnet_fpn})} \\
\midrule
\multicolumn{3}{c}{\textbf{RoI Head: Motions} (Table \ref{table:motionrcnn_resnet_fpn})}\\
\bottomrule
\end{tabular}
\caption{
Preliminary Motion R-CNN ResNet-50-FPN architecture with depth prediction,
based on the Mask R-CNN ResNet-50-FPN architecture (Table \ref{table:maskrcnn_resnet_fpn}).
}
\label{table:motionrcnn_resnet_fpn_depth}
\end{table}
}

\paragraph{Training on real world data}
Due to the amount of supervision required by the different components of the network
and the complexity of the optimization problem,

@ -167,6 +167,13 @@ The flow error map depicts correct estimates ($\leq 3$ px or $\leq 5\%$ error) i
FPN & cam. & sup. & XYZ & $E_{R} [deg]$ & $E_{t} [m]$ & $E_{p} [m] $ & $E_{R}^{cam} [deg]$ & $E_{t}^{cam} [m]$ & AEE & Fl-all \\\midrule
$\times$ & \checkmark & 3D & \checkmark & 0.4 & 0.49 & 17.06 & 0.1 & 0.04 & 6.73 & 26.59\% \\
\checkmark & \checkmark & 3D & \checkmark & 0.35 & 0.38 & 11.87 & 0.22 & 0.07 & 12.62 & 46.28\% \\
$\times$ & $\times$ & 3D & \checkmark & ? & ? & ? & - & - & ? & ? \% \\
\checkmark & $\times$ & 3D & \checkmark & ? & ? & ? & - & - & ? & ? \% \\
\midrule
$\times$ & \checkmark & flow & \checkmark & ? & ? & ? & ? & ? & ? & ? \% \\
\checkmark & \checkmark & flow & \checkmark & ? & ? & ? & ? & ? & ? & ? \% \\
$\times$ & $\times$ & flow & \checkmark & ? & ? & ? & - & - & ? & ? \% \\
\checkmark & $\times$ & flow & \checkmark & ? & ? & ? & - & - & ? & ? \% \\
\bottomrule
\end{tabular}

@ -175,13 +182,13 @@ Comparison of network variants on the Virtual KITTI validation set.
AEE: Average Endpoint Error; Fl-all: Ratio of pixels where the flow estimate is
wrong by both $\geq 3$ pixels and $\geq 5\%$.
Camera and instance motion errors are averaged over the validation set.
We optionally enable camera motion prediction (cam.),
replace the ResNet-50 backbone with ResNet-50-FPN (FPN),
or input XYZ coordinates into the backbone (XYZ).
We either supervise
object motions (sup.) with 3D motion ground truth (3D) or
with a 2D re-projection loss based on flow ground truth (flow).
Note that for rows where no camera motion is predicted, the optical flow
is composed using the ground truth camera motion and thus the flow error is
only impacted by the predicted 3D object motions.
}

BIN figures/fpn.png (new file, 97 KiB)
BIN figures/maskrcnn_cs.png (new file, 1.1 MiB)
BIN figures/sfmnet_kitti.png (new file, 868 KiB)

@ -42,15 +42,34 @@ and dense optical flow in monocular image sequences based on estimating the 3D m
SfM-Net predicts a batch of binary full-image masks specifying the object memberships of individual pixels with a standard encoder-decoder
network for pixel-wise prediction. A fully connected network branching off the encoder predicts a 3D motion for each object.
However, due to the fixed number of object masks, the system can only predict a small number of motions and
often fails to properly segment the pixels into the correct masks or assigns background pixels to object motions (Figure \ref{figure:sfmnet_kitti}).
\begin{figure}[t]
\centering
\includegraphics[width=\textwidth]{figures/sfmnet_kitti}
\caption{
Results of SfM-Net \cite{SfMNet} on KITTI \cite{KITTI2015}.
From left to right, we show: instance segmentation into up to 3 independent objects,
ground truth instance masks for the segmented objects, composed optical flow, and ground truth optical flow.
}
\label{figure:sfmnet_kitti}
\end{figure}
Thus, this approach is very unlikely to scale to dynamic scenes with a potentially
large number of diverse objects due to the inflexible nature of its instance segmentation technique.

A scalable approach to instance segmentation based on region-based convolutional networks
was recently introduced with Mask R-CNN \cite{MaskRCNN}, which inherits from Faster R-CNN the ability to detect
a large number of objects from a large number of classes at once,
and predicts pixel-precise segmentation masks for each detected object (Figure \ref{figure:maskrcnn_cs}).

\begin{figure}[t]
\centering
\includegraphics[width=\textwidth]{figures/maskrcnn_cs}
\caption{
Instance segmentation results of Mask R-CNN ResNet-50-FPN \cite{MaskRCNN}
on Cityscapes \cite{Cityscapes}.
}
\label{figure:maskrcnn_cs}
\end{figure}

We propose \emph{Motion R-CNN}, which combines the scalable instance segmentation capabilities of
Mask R-CNN with the end-to-end 3D motion estimation approach introduced with SfM-Net.

@ -83,7 +83,6 @@

\newcommand{\todo}[1]{\textbf{\textcolor{red}{#1}}}
\setlength{\belowrulesep}{0pt}

% Front matter
\author{\myname}