mirror of https://github.com/tu-darmstadt-informatik/bsc-thesis.git
synced 2025-12-12 17:35:51 +00:00

WIP

This commit is contained in:
parent 3c294bddc8
commit c1efb75b1a

approach.tex
@@ -26,7 +26,6 @@ object-centric framework of a region based convolutional network head with a 3D
Thus, in contrast to the dense FlowNet decoder, the estimated dense motion information
from the encoder is integrated for specific objects via RoI cropping and
processed by the RoI head for each object.
\todo{figure of backbone}

\paragraph{Per-RoI motion prediction}
We use a rigid 3D motion parametrization similar to the one used in SfM-Net and SE3-Nets \cite{SfmNet,SE3Nets}.
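(The exact parametrization is defined further down in the thesis, outside this hunk. For orientation only: in the SfM-Net-style formulation referred to here, each object $k$ carries a rotation $R_t^k$, a translation $t_t^k$ and a pivot $p_t^k$, and a 3D point $P_t$ on the object would plausibly be moved as

\begin{equation*}
P'_{t+1} = R_t^k \, (P_t - p_t^k) + p_t^k + t_t^k,
\end{equation*}

so that the rotation acts about the object's pivot rather than the camera origin. This is an illustrative restatement consistent with the pivot variables used later, not a quotation of the thesis.)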
@@ -90,9 +89,55 @@ between the two frames $I_t$ and $I_{t+1}$.
For this, we flatten the bottleneck output of the backbone and pass it through a fully connected layer.
We again represent $R_t^{cam}$ using an Euler angle representation and
predict $\sin(\alpha)$, $\sin(\beta)$, $\sin(\gamma)$ and $t_t^{cam}$ in the same way as for the individual objects.
-Again, we predict a softmax score $o_t^k$ for differentiating between
+Again, we predict a softmax score $o_t^{cam}$ for differentiating between
a still and a moving camera.

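As an illustration of this angle parametrization, below is a minimal NumPy sketch (not from the thesis) of how a rotation matrix could be recovered from the three predicted sines. The Z-Y-X composition order and the assumption that the angles lie in $[-\pi/2, \pi/2]$ are my own choices, since the excerpt does not fix a convention.

import numpy as np

def rotation_from_sines(sin_alpha, sin_beta, sin_gamma):
    # Recover the Euler angles; valid when the true angles lie in [-pi/2, pi/2],
    # which is why predicting the (bounded) sines is convenient.
    a, b, g = np.arcsin([sin_alpha, sin_beta, sin_gamma])
    Rx = np.array([[1, 0, 0],
                   [0, np.cos(a), -np.sin(a)],
                   [0, np.sin(a), np.cos(a)]])
    Ry = np.array([[np.cos(b), 0, np.sin(b)],
                   [0, 1, 0],
                   [-np.sin(b), 0, np.cos(b)]])
    Rz = np.array([[np.cos(g), -np.sin(g), 0],
                   [np.sin(g), np.cos(g), 0],
                   [0, 0, 1]])
    return Rz @ Ry @ Rx  # assumed composition order R = Rz(gamma) Ry(beta) Rx(alpha)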
{
\begin{table}[h]
\centering
\begin{tabular}{llr}
layer id & layer operations & output dimensions \\
\toprule \\
& input image & H $\times$ W $\times$ C \\
\midrule \\
C$_4$ & \textbf{ResNet-50} [up to C$_4$] & $\tfrac{1}{16}$ H $\times$ $\tfrac{1}{16}$ W $\times$ 1024 \\
\midrule \\
\multicolumn{3}{c}{\textbf{Region Proposal Network (RPN)} (see Table \ref{table:maskrcnn_resnet})}\\
\midrule \\
\multicolumn{3}{c}{\textbf{Camera Motion Network}}\\
\midrule \\
& From C$_4$: \textbf{ResNet-50} [C$_5$ without stride] & $\tfrac{1}{32}$ H $\times$ $\tfrac{1}{32}$ W $\times$ 2048 \\
& average pool & 1 $\times$ 2048 \\
& fully connected, 1024 & 1 $\times$ 1024 \\
M$_1$ & fully connected, 1024 & 1 $\times$ 1024 \\
$R_t^{cam}$& From M$_1$: fully connected, 3 & 1 $\times$ 3 \\
$t_t^{cam}$& From M$_1$: fully connected, 3 & 1 $\times$ 3 \\
$o_t^{cam}$& From M$_1$: fully connected, 2 & 1 $\times$ 2 \\
\midrule \\
\multicolumn{3}{c}{\textbf{RoI Head} (see Table \ref{table:maskrcnn_resnet})}\\
\midrule \\
\multicolumn{3}{c}{\textbf{RoI Head: Masks} (see Table \ref{table:maskrcnn_resnet})}\\
\midrule \\
\multicolumn{3}{c}{\textbf{RoI Head: Motions}}\\
\midrule \\
& From ave: fully connected, 1024 & N$_{RPN}$ $\times$ 1024 \\
M$_2$ & fully connected, 1024 & N$_{RPN}$ $\times$ 1024 \\
$\forall k: R_t^k$ & From M$_2$: fully connected, 3 & N$_{RPN}$ $\times$ 3 \\
$\forall k: t_t^k$ & From M$_2$: fully connected, 3 & N$_{RPN}$ $\times$ 3 \\
$\forall k: p_t^k$ & From M$_2$: fully connected, 3 & N$_{RPN}$ $\times$ 3 \\
$\forall k: o_t^k$ & From M$_2$: fully connected, 2 & N$_{RPN}$ $\times$ 2 \\

\bottomrule
\end{tabular}

\caption {
Motion R-CNN ResNet architecture based on the Mask R-CNN
ResNet architecture (Table \ref{table:maskrcnn_resnet}).
}
\label{table:motion_rcnn_resnet}
\end{table}
}
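To make the Camera Motion Network rows of the table concrete, here is a rough tf.keras sketch of that fully connected head. It is an illustration only: layer names, activation choices, and the use of tf.keras are my assumptions, not the thesis implementation. The per-RoI motion head (M$_2$) follows the same pattern, applied to the N$_{RPN}$ pooled RoI features.

import tensorflow as tf

def camera_motion_head(c5_features):
    # c5_features: backbone output, shape [batch, h, w, 2048]
    x = tf.keras.layers.GlobalAveragePooling2D()(c5_features)   # average pool -> [batch, 2048]
    x = tf.keras.layers.Dense(1024, activation='relu')(x)       # fully connected, 1024
    m1 = tf.keras.layers.Dense(1024, activation='relu')(x)      # M_1
    r_cam = tf.keras.layers.Dense(3, activation='tanh')(m1)     # R_t^cam: sin(alpha), sin(beta), sin(gamma)
    t_cam = tf.keras.layers.Dense(3)(m1)                        # t_t^cam: camera translation
    o_cam = tf.keras.layers.Dense(2)(m1)                        # o_t^cam: still/moving logits (softmax applied in the loss)
    return r_cam, t_cam, o_cam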

\subsection{Supervision}
\label{ssec:supervision}

@@ -108,8 +153,8 @@ Similar to the camera pose regression loss in \cite{PoseNet2},
we use a variant of the $\ell_1$-loss to penalize the differences between ground truth and predicted
rotation, translation (and pivot, in our case). We found that the smooth $\ell_1$-loss
performs better in our case than the standard $\ell_1$-loss.
-For each RoI, we compute the motion loss $L_{motion}^k$ as a linear sum of
-the individual losses,
+For each RoI, we compute the total motion loss $L_{motion}^k$ from
+the individual loss terms as,

\begin{equation}
L_{motion}^k = l_{p}^k + (l_{R}^k + l_{t}^k) \cdot o^{gt,i_k} + l_o^k,
@@ -124,7 +169,7 @@ l_{t}^k = \ell_1^* (t^{gt,i_k} - t^{k,c_k}),
\begin{equation}
l_{p}^k = \ell_1^* (p^{gt,i_k} - p^{k,c_k}).
\end{equation}
-are the smooth $\ell_1$-losses for the predicted rotation, translation and pivot,
+are the smooth $\ell_1$-loss terms for the predicted rotation, translation and pivot,
respectively and
\begin{equation}
l_o^k = \ell_{cls}(o_t^k, o^{gt,i_k}).
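A small NumPy sketch of the per-RoI loss just defined may help: smooth $\ell_1$ terms for pivot, rotation and translation (the latter two gated by the ground-truth moving/still label) plus a classification term. The function names and the interpretation of $o^{gt,i_k} \in \{0, 1\}$ as a moving indicator are my assumptions.

import numpy as np

def smooth_l1(x):
    # smooth l1 (Huber-style) loss, summed over the vector components
    x = np.abs(x)
    return np.sum(np.where(x < 1.0, 0.5 * x ** 2, x - 0.5))

def motion_loss(R_pred, t_pred, p_pred, o_logits, R_gt, t_gt, p_gt, o_gt):
    l_R = smooth_l1(R_gt - R_pred)
    l_t = smooth_l1(t_gt - t_pred)
    l_p = smooth_l1(p_gt - p_pred)
    # softmax cross-entropy over the two still/moving classes
    log_probs = o_logits - np.log(np.sum(np.exp(o_logits)))
    l_o = -log_probs[o_gt]
    # rotation and translation are only penalized for objects labeled as moving (o_gt == 1)
    return l_p + (l_R + l_t) * o_gt + l_o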
@@ -226,7 +271,7 @@ Next, we transform all points given the camera transformation $\{R_t^c, t_t^c\}$
\begin{pmatrix}
X_{t+1} \\ Y_{t+1} \\ Z_{t+1}
\end{pmatrix}
-= P_{t+1} = R_t^c \cdot P'_{t+1} + t_t^k
+= P_{t+1} = R_t^c \cdot P'_{t+1} + t_t^c
\end{equation}.

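A compact NumPy sketch of this two-step point transformation (not from the thesis; the pivot-based form of the preceding object-motion step is an assumption based on the parametrization described earlier):

import numpy as np

def transform_point(P_t, R_k, t_k, p_k, R_c, t_c):
    # object motion about its pivot (assumed form): P'_{t+1} = R_k (P_t - p_k) + p_k + t_k
    P_obj = R_k @ (P_t - p_k) + p_k + t_k
    # camera motion as in the equation above: P_{t+1} = R_c P'_{t+1} + t_c
    return R_c @ P_obj + t_c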
Note that in our experiments, we either use the ground truth camera motion to focus

background.tex
@@ -48,7 +48,7 @@ performing upsampling of the compressed features and resulting in an encoder-decoder
The most popular deep networks of this kind for end-to-end optical flow prediction
are variants of the FlowNet family \cite{FlowNet, FlowNet2},
which was recently extended to scene flow estimation \cite{SceneFlowDataset}.
-Figure \ref{} shows the classical FlowNetS architecture for optical flow prediction.
+Table \ref{} shows the classical FlowNetS architecture for optical flow prediction.
Note that the network itself is a rather generic autoencoder and is specialized for optical flow only through being trained
with supervision from dense optical flow ground truth.
Potentially, the same network could also be used for semantic segmentation if
@@ -60,17 +60,94 @@ operations in the encoder.
Recently, other encoder-decoder CNNs have been applied to optical flow as well \cite{DenseNetDenseFlow}.

\subsection{SfM-Net}
-Here, we will describe the SfM-Net architecture in more detail and show its results
+Here, we will describe the SfM-Net \cite{SfmNet} architecture in more detail and show its results
and some of the issues.

\subsection{ResNet}
\label{ssec:resnet}
For completeness, we will give a short review of the ResNet \cite{ResNet} architecture we will use
as a backbone CNN for our network.
ResNet \cite{ResNet} was initially introduced as a CNN for image classification, but
became popular as a basic building block of many deep network architectures for a variety
of different tasks. In Table \ref{table:resnet}, we show the ResNet-50 variant
that will serve as the basic CNN backbone of our networks, and
which is also used in many other region-based convolutional networks.
The input image data is always passed through ResNet-50 as a first step to
bootstrap the complete deep network.
Figure \ref{figure:bottleneck}
shows the fundamental building block of ResNet-50.

{
\begin{table}[h]
\centering
\begin{tabular}{llr}
layer id & layer operations & output dimensions \\
\toprule \\
& input image & H $\times$ W $\times$ C \\
\midrule \\
\multicolumn{3}{c}{\textbf{ResNet-50}}\\
\midrule \\
C$_1$ & 7 $\times$ 7 conv, 64, stride 2 & $\tfrac{1}{2}$ H $\times$ $\tfrac{1}{2}$ W $\times$ 64 \\

& 3 $\times$ 3 max pool, stride 2 & $\tfrac{1}{4}$ H $\times$ $\tfrac{1}{4}$ W $\times$ 64 \\

C$_2$ &
$\begin{bmatrix}
1 \times 1, 64 \\
3 \times 3, 64 \\
1 \times 1, 256 \\
\end{bmatrix}_b$ $\times$ 3
& $\tfrac{1}{4}$ H $\times$ $\tfrac{1}{4}$ W $\times$ 256 \\
\midrule \\
C$_3$ &
$\begin{bmatrix}
1 \times 1, 128 \\
3 \times 3, 128 \\
1 \times 1, 512 \\
\end{bmatrix}_{b/2}$ $\times$ 4
& $\tfrac{1}{8}$ H $\times$ $\tfrac{1}{8}$ W $\times$ 512 \\
\midrule \\
C$_4$ &
$\begin{bmatrix}
1 \times 1, 256 \\
3 \times 3, 256 \\
1 \times 1, 1024 \\
\end{bmatrix}_{b/2}$ $\times$ 6
& $\tfrac{1}{16}$ H $\times$ $\tfrac{1}{16}$ W $\times$ 1024 \\
\midrule \\
C$_5$ &
$\begin{bmatrix}
1 \times 1, 512 \\
3 \times 3, 512 \\
1 \times 1, 2048 \\
\end{bmatrix}_{b/2}$ $\times$ 3
& $\tfrac{1}{32}$ H $\times$ $\tfrac{1}{32}$ W $\times$ 2048 \\

\bottomrule
\end{tabular}

\caption {
ResNet-50 \cite{ResNet} architecture.
Operations enclosed in a []$_b$ block make up a single ResNet \enquote{bottleneck}
block (see Figure \ref{figure:bottleneck}). If the block is denoted as []$_{b/2}$,
the first conv operation in the block has a stride of 2. Note that the stride
is only applied to the first block, but not to repeated blocks.
}
\label{table:resnet}
\end{table}
}

\begin{figure}[t]
\centering
\includegraphics[width=0.3\textwidth]{figures/bottleneck}
\caption{
ResNet \cite{ResNet} \enquote{bottleneck} block introduced to reduce computational
complexity in deeper network variants, shown here with 256 input and output channels.
}
\label{figure:bottleneck}
\end{figure}

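To complement Table \ref{table:resnet} and Figure \ref{figure:bottleneck}, here is a hedged tf.keras sketch of a bottleneck block and of stacking blocks into a stage (stride only on the first block, as the caption notes). Batch normalization is omitted and the activation placement is simplified, so this is an illustration rather than a faithful ResNet-50 implementation.

import tensorflow as tf

def bottleneck_block(x, channels, stride=1):
    # 1x1 reduce -> 3x3 -> 1x1 expand to 4*channels, with a (projected) shortcut
    out = tf.keras.layers.Conv2D(channels, 1, strides=stride, activation='relu')(x)
    out = tf.keras.layers.Conv2D(channels, 3, padding='same', activation='relu')(out)
    out = tf.keras.layers.Conv2D(4 * channels, 1)(out)
    shortcut = x
    if stride != 1 or x.shape[-1] != 4 * channels:
        # project the shortcut when the spatial size or channel count changes
        shortcut = tf.keras.layers.Conv2D(4 * channels, 1, strides=stride)(x)
    return tf.keras.layers.ReLU()(tf.keras.layers.Add()([out, shortcut]))

def resnet_stage(x, channels, num_blocks, stride):
    # the stride is applied only to the first block of the stage; repeats keep the resolution
    x = bottleneck_block(x, channels, stride=stride)
    for _ in range(num_blocks - 1):
        x = bottleneck_block(x, channels, stride=1)
    return x

# e.g. C2..C5 of ResNet-50 correspond to stages with (64, 3), (128, 4), (256, 6), (512, 3) blocks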
\subsection{Region-based convolutional networks}
\label{ssec:rcnn}
-We now give a short review of region-based convolutional networks, which are currently by far the
+We now give an overview of region-based convolutional networks, which are currently by far the
most popular deep networks for object detection, and have recently also been applied to instance segmentation.

\paragraph{R-CNN}
@@ -146,6 +223,49 @@ variant based on Feature Pyramid Networks \cite{FPN}.
Figure \ref{} compares the two Mask R-CNN head variants.
\todo{RoI Align}

{
\begin{table}[h]
\centering
\begin{tabular}{llr}
layer id & layer operations & output dimensions \\
\toprule \\
& input image & H $\times$ W $\times$ C \\
\midrule \\
C$_4$ & \textbf{ResNet-50} [up to C$_4$] & $\tfrac{1}{16}$ H $\times$ $\tfrac{1}{16}$ W $\times$ 1024 \\
\midrule \\
\multicolumn{3}{c}{\textbf{Region Proposal Network (RPN)}}\\
\midrule \\
& From C$_4$: 1 $\times$ 1 conv, 512 & $\tfrac{1}{16}$ H $\times$ $\tfrac{1}{16}$ W $\times$ 512 \\
& 1 $\times$ 1 conv, 4 & $\tfrac{1}{16}$ H $\times$ $\tfrac{1}{16}$ W $\times$ 4 \\
& flatten & A $\times$ 4 \\
& decode bounding boxes \ref{} & A $\times$ 4 \\
boxes$_{\mathrm{RPN}}$ & sample bounding boxes \ref{} & N$_{RPN}$ $\times$ 4 \\
\midrule \\
\multicolumn{3}{c}{\textbf{RoI Head}}\\
\midrule \\
& From C$_4$ with boxes$_{\mathrm{RPN}}$: RoI pooling \ref{} & N$_{RPN}$ $\times$ 7 $\times$ 7 $\times$ 1024 \\
R$_1$& \textbf{ResNet-50} [C$_5$ without stride] & N$_{RPN}$ $\times$ 7 $\times$ 7 $\times$ 2048 \\
ave & average pool & N$_{RPN}$ $\times$ 2048 \\
boxes& From ave: fully connected, 4 & N$_{RPN}$ $\times$ 4 \\
logits& From ave: fully connected, N$_{cls}$ & N$_{RPN}$ $\times$ N$_{cls}$ \\
\midrule \\
\multicolumn{3}{c}{\textbf{RoI Head: Masks}}\\
\midrule \\
& From R$_1$: 2 $\times$ 2 deconv, 256, stride 2 & N$_{RPN}$ $\times$ 14 $\times$ 14 $\times$ 256 \\
masks & 1 $\times$ 1 conv, N$_{cls}$ & N$_{RPN}$ $\times$ 14 $\times$ 14 $\times$ N$_{cls}$ \\

\bottomrule
\end{tabular}

\caption {
Mask R-CNN \cite{MaskRCNN} ResNet \cite{ResNet} architecture.
Note that this is equivalent to the Faster R-CNN architecture if the mask
head is left out.
}
\label{table:maskrcnn_resnet}
\end{table}
}

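Since the table's RoI pooling row (and the RoI Align todo above) is central to how per-RoI features are extracted, a naive NumPy sketch of quantized RoI max pooling follows. The bin handling and coordinate conventions are simplified assumptions, the box is assumed to lie inside the feature map, and RoI Align would replace the hard quantization with bilinear sampling.

import numpy as np

def roi_max_pool(features, box, out_size=7):
    # features: [H, W, C] feature map (e.g. C_4); box: (y0, x0, y1, x1) in feature-map coordinates
    y0, x0, y1, x1 = box
    ys = np.linspace(y0, y1, out_size + 1).astype(int)
    xs = np.linspace(x0, x1, out_size + 1).astype(int)
    out = np.zeros((out_size, out_size, features.shape[-1]), dtype=features.dtype)
    for i in range(out_size):
        for j in range(out_size):
            y_lo, y_hi = ys[i], max(ys[i + 1], ys[i] + 1)  # keep each bin non-empty
            x_lo, x_hi = xs[j], max(xs[j + 1], xs[j] + 1)
            out[i, j] = features[y_lo:y_hi, x_lo:x_hi].max(axis=(0, 1))
    return out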
\paragraph{Bounding box regression}
All bounding boxes predicted by the RoI head or RPN are estimated as offsets
with respect to a reference bounding box. In the case of the RPN,

@@ -176,7 +176,7 @@ AEE: Average Endpoint Error; Fl-all: Ratio of pixels where flow estimate is
wrong by both $\geq 3$ pixels and $\geq 5\%$.
Camera and instance motion errors are averaged over the validation set.
We optionally train camera motion prediction (cam.),
-replace the ResNet50 backbone with ResNet50-FPN (FPN),
+replace the ResNet-50 backbone with ResNet-50-FPN (FPN),
or input XYZ coordinates into the backbone (XYZ).
We either supervise
object motions (sup.) with 3D motion ground truth (3D) or

BIN figures/bottleneck.png (new file, 30 KiB; binary file not shown)
@@ -28,6 +28,7 @@
\usepackage{lipsum} % for generating Lorem Ipsum filler text
\usepackage[math]{blindtext} % for generating German filler text
\usepackage{hyperref} % links within the document
+\usepackage{csquotes}

% INFO %
% The hyperref package should be loaded as late as possible, which is why it appears further down in the preamble.
@@ -82,7 +83,7 @@

\newcommand{\todo}[1]{\textbf{\textcolor{red}{#1}}}

\setlength{\belowrulesep}{0pt}

% Title matter
\author{\myname}