Repository: https://github.com/tu-darmstadt-informatik/bsc-thesis.git
Commit c1efb75b1a (parent 3c294bddc8): WIP

approach.tex | 57
@@ -26,7 +26,6 @@ object-centric framework of a region based convolutional network head with a 3D

Thus, in contrast to the dense FlowNet decoder, the estimated dense motion information
from the encoder is integrated for specific objects via RoI cropping and
processed by the RoI head for each object.

\paragraph{Per-RoI motion prediction}
We use a rigid 3D motion parametrization similar to the one used in SfM-Net and SE3-Nets \cite{SfmNet,SE3Nets}.
@@ -90,9 +89,55 @@ between the two frames $I_t$ and $I_{t+1}$.

For this, we flatten the bottleneck output of the backbone and pass it through a fully connected layer.
We again represent $R_t^{cam}$ using an Euler angle representation and
predict $\sin(\alpha)$, $\sin(\beta)$, $\sin(\gamma)$ and $t_t^{cam}$ in the same way as for the individual objects.
Again, we predict a softmax score $o_t^{cam}$ for differentiating between
a still and a moving camera.
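To make the camera motion head concrete, the following is a minimal sketch of how such a network could be implemented (PyTorch is used purely for illustration; the layer widths follow Table \ref{table:motion_rcnn_resnet} below, while the global average pooling, ReLU activations and tanh-bounded sines are assumptions, not the author's implementation):

\begin{verbatim}
import torch
import torch.nn as nn

class CameraMotionHead(nn.Module):
    # Illustrative sketch only: predicts sin(alpha), sin(beta), sin(gamma),
    # t_t^cam and a still/moving score o_t^cam from the backbone bottleneck.
    def __init__(self, in_channels=2048):
        super().__init__()
        self.fc1 = nn.Linear(in_channels, 1024)
        self.fc2 = nn.Linear(1024, 1024)        # M_1
        self.fc_rot = nn.Linear(1024, 3)        # sines of the Euler angles
        self.fc_trans = nn.Linear(1024, 3)      # t_t^cam
        self.fc_moving = nn.Linear(1024, 2)     # logits for still vs. moving camera

    def forward(self, features):                # features: (1, 2048, h, w)
        x = features.mean(dim=(2, 3))           # global average pool -> (1, 2048)
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        sines = torch.tanh(self.fc_rot(x))      # assumption: bound sines to [-1, 1]
        t_cam = self.fc_trans(x)
        o_cam = torch.softmax(self.fc_moving(x), dim=-1)
        return sines, t_cam, o_cam
\end{verbatim}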
{
\begin{table}[h]
\centering
\begin{tabular}{llr}
layer id & layer operations & output dimensions \\
\toprule \\
& input image & H $\times$ W $\times$ C \\
\midrule \\
C$_4$ & \textbf{ResNet-50} [up to C$_4$] & $\tfrac{1}{16}$ H $\times$ $\tfrac{1}{16}$ W $\times$ 1024 \\
\midrule \\
\multicolumn{3}{c}{\textbf{Region Proposal Network (RPN)} (see Table \ref{table:maskrcnn_resnet})}\\
\midrule \\
\multicolumn{3}{c}{\textbf{Camera Motion Network}}\\
\midrule \\
& From C$_4$: \textbf{ResNet-50} [C$_5$ without stride] & $\tfrac{1}{32}$ H $\times$ $\tfrac{1}{32}$ W $\times$ 2048 \\
& average pool & 1 $\times$ 2048 \\
& fully connected, 1024 & 1 $\times$ 1024 \\
M$_1$ & fully connected, 1024 & 1 $\times$ 1024 \\
$R_t^{cam}$ & From M$_1$: fully connected, 3 & 1 $\times$ 3 \\
$t_t^{cam}$ & From M$_1$: fully connected, 3 & 1 $\times$ 3 \\
$o_t^{cam}$ & From M$_1$: fully connected, 2 & 1 $\times$ 2 \\
\midrule \\
\multicolumn{3}{c}{\textbf{RoI Head} (see Table \ref{table:maskrcnn_resnet})}\\
\midrule \\
\multicolumn{3}{c}{\textbf{RoI Head: Masks} (see Table \ref{table:maskrcnn_resnet})}\\
\midrule \\
\multicolumn{3}{c}{\textbf{RoI Head: Motions}}\\
\midrule \\
& From ave: fully connected, 1024 & N$_{RPN}$ $\times$ 1024 \\
M$_2$ & fully connected, 1024 & N$_{RPN}$ $\times$ 1024 \\
$\forall k: R_t^k$ & From M$_2$: fully connected, 3 & N$_{RPN}$ $\times$ 3 \\
$\forall k: t_t^k$ & From M$_2$: fully connected, 3 & N$_{RPN}$ $\times$ 3 \\
$\forall k: p_t^k$ & From M$_2$: fully connected, 3 & N$_{RPN}$ $\times$ 3 \\
$\forall k: o_t^k$ & From M$_2$: fully connected, 2 & N$_{RPN}$ $\times$ 2 \\
\bottomrule
\end{tabular}
\caption{
Motion R-CNN ResNet architecture based on the Mask R-CNN
ResNet architecture (Table \ref{table:maskrcnn_resnet}).
}
\label{table:motion_rcnn_resnet}
\end{table}
}
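Analogously, the per-RoI motion branch listed under \enquote{RoI Head: Motions} above could be sketched as follows (again a PyTorch illustration rather than the author's code; the fully connected widths and output sizes follow the table, the ReLU activations are an assumption):

\begin{verbatim}
import torch
import torch.nn as nn

class RoIMotionHead(nn.Module):
    # Illustrative sketch: per-RoI prediction of R_t^k, t_t^k, p_t^k and o_t^k.
    def __init__(self, in_channels=2048):
        super().__init__()
        self.fc1 = nn.Linear(in_channels, 1024)
        self.fc2 = nn.Linear(1024, 1024)        # M_2
        self.fc_rot = nn.Linear(1024, 3)        # R_t^k as sines of Euler angles
        self.fc_trans = nn.Linear(1024, 3)      # t_t^k
        self.fc_pivot = nn.Linear(1024, 3)      # p_t^k
        self.fc_moving = nn.Linear(1024, 2)     # o_t^k logits (still vs. moving object)

    def forward(self, roi_features):            # roi_features: (N_RPN, 2048), the "ave" features
        x = torch.relu(self.fc1(roi_features))
        x = torch.relu(self.fc2(x))
        return (self.fc_rot(x), self.fc_trans(x),
                self.fc_pivot(x), self.fc_moving(x))
\end{verbatim}
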
\subsection{Supervision}
\label{ssec:supervision}

@@ -108,8 +153,8 @@ Similar to the camera pose regression loss in \cite{PoseNet2},

we use a variant of the $\ell_1$-loss to penalize the differences between ground truth and predicted
rotation, translation (and pivot, in our case). We found that the smooth $\ell_1$-loss
performs better than the standard $\ell_1$-loss.
For each RoI, we compute the total motion loss $L_{motion}^k$ from
the individual loss terms as

\begin{equation}
L_{motion}^k = l_{p}^k + (l_{R}^k + l_{t}^k) \cdot o^{gt,i_k} + l_o^k,
@@ -124,7 +169,7 @@ l_{t}^k = \ell_1^* (t^{gt,i_k} - t^{k,c_k}),

\begin{equation}
l_{p}^k = \ell_1^* (p^{gt,i_k} - p^{k,c_k})
\end{equation}
are the smooth $\ell_1$-loss terms for the predicted rotation, translation and pivot,
respectively, and
\begin{equation}
l_o^k = \ell_{cls}(o_t^k, o^{gt,i_k}).
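Put together, the per-RoI motion loss could be computed along the following lines (a sketch under the assumption that each RoI $k$ has already been matched to a ground-truth instance $i_k$; PyTorch is used for illustration only, not the author's implementation):

\begin{verbatim}
import torch
import torch.nn.functional as F

def roi_motion_loss(pred_rot, pred_trans, pred_pivot, pred_moving_logits,
                    gt_rot, gt_trans, gt_pivot, gt_moving):
    # Sketch of L_motion^k, averaged over a batch of matched RoIs.
    l_R = F.smooth_l1_loss(pred_rot, gt_rot, reduction='none').sum(dim=-1)
    l_t = F.smooth_l1_loss(pred_trans, gt_trans, reduction='none').sum(dim=-1)
    l_p = F.smooth_l1_loss(pred_pivot, gt_pivot, reduction='none').sum(dim=-1)
    l_o = F.cross_entropy(pred_moving_logits, gt_moving, reduction='none')
    # As in the equation above, the rotation and translation terms are gated
    # by the ground-truth moving/still label o^{gt,i_k}.
    loss = l_p + (l_R + l_t) * gt_moving.float() + l_o
    return loss.mean()
\end{verbatim}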
@@ -226,7 +271,7 @@ Next, we transform all points given the camera transformation $\{R_t^c, t_t^c\}$

\begin{pmatrix}
X_{t+1} \\ Y_{t+1} \\ Z_{t+1}
\end{pmatrix}
= P_{t+1} = R_t^c \cdot P'_{t+1} + t_t^c.
\end{equation}

Note that in our experiments, we either use the ground truth camera motion to focus
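As a minimal illustration of this step (not the author's code), the camera motion can be applied to the object-transformed point map with a single matrix product, for example in NumPy:

\begin{verbatim}
import numpy as np

def apply_camera_motion(points, R_cam, t_cam):
    # points: (H, W, 3) object-transformed 3D points P'_{t+1} in camera coordinates
    # R_cam:  (3, 3) rotation R_t^c,  t_cam: (3,) translation t_t^c
    # returns P_{t+1} = R_t^c . P'_{t+1} + t_t^c for every pixel
    return points @ R_cam.T + t_cam
\end{verbatim}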
background.tex | 130
@@ -48,7 +48,7 @@ performing upsampling of the compressed features and resulting in a encoder-deco

The most popular deep networks of this kind for end-to-end optical flow prediction
are variants of the FlowNet family \cite{FlowNet, FlowNet2},
which was recently extended to scene flow estimation \cite{SceneFlowDataset}.
Table \ref{} shows the classical FlowNetS architecture for optical flow prediction.
Note that the network itself is a rather generic autoencoder and is specialized for optical flow only through being trained
with supervision from dense optical flow ground truth.
Potentially, the same network could also be used for semantic segmentation if
@@ -60,17 +60,94 @@ operations in the encoder.

Recently, other encoder-decoder CNNs have been applied to optical flow as well \cite{DenseNetDenseFlow}.

\subsection{SfM-Net}
Here, we will describe the SfM-Net \cite{SfmNet} architecture in more detail and show its results
and some of its issues.

\subsection{ResNet}
\label{ssec:resnet}
ResNet \cite{ResNet} was initially introduced as a CNN for image classification, but
became popular as a basic building block of many deep network architectures for a variety
of different tasks. In Table \ref{table:resnet}, we show the ResNet-50 variant
that will serve as the basic CNN backbone of our networks and
is also used in many other region-based convolutional networks.
The input image is always passed through ResNet-50 as a first step to
bootstrap the complete deep network.
Figure \ref{figure:bottleneck}
shows the fundamental building block of ResNet-50.

{
\begin{table}[h]
\centering
\begin{tabular}{llr}
layer id & layer operations & output dimensions \\
\toprule \\
& input image & H $\times$ W $\times$ C \\
\midrule \\
\multicolumn{3}{c}{\textbf{ResNet-50}}\\
\midrule \\
C$_1$ & 7 $\times$ 7 conv, 64, stride 2 & $\tfrac{1}{2}$ H $\times$ $\tfrac{1}{2}$ W $\times$ 64 \\
& 3 $\times$ 3 max pool, stride 2 & $\tfrac{1}{4}$ H $\times$ $\tfrac{1}{4}$ W $\times$ 64 \\
C$_2$ &
$\begin{bmatrix}
1 \times 1, 64 \\
3 \times 3, 64 \\
1 \times 1, 256 \\
\end{bmatrix}_b$ $\times$ 3
& $\tfrac{1}{4}$ H $\times$ $\tfrac{1}{4}$ W $\times$ 256 \\
\midrule \\
C$_3$ &
$\begin{bmatrix}
1 \times 1, 128 \\
3 \times 3, 128 \\
1 \times 1, 512 \\
\end{bmatrix}_{b/2}$ $\times$ 4
& $\tfrac{1}{8}$ H $\times$ $\tfrac{1}{8}$ W $\times$ 512 \\
\midrule \\
C$_4$ &
$\begin{bmatrix}
1 \times 1, 256 \\
3 \times 3, 256 \\
1 \times 1, 1024 \\
\end{bmatrix}_{b/2}$ $\times$ 6
& $\tfrac{1}{16}$ H $\times$ $\tfrac{1}{16}$ W $\times$ 1024 \\
\midrule \\
C$_5$ &
$\begin{bmatrix}
1 \times 1, 512 \\
3 \times 3, 512 \\
1 \times 1, 2048 \\
\end{bmatrix}_{b/2}$ $\times$ 3
& $\tfrac{1}{32}$ H $\times$ $\tfrac{1}{32}$ W $\times$ 2048 \\
\bottomrule
\end{tabular}
\caption{
ResNet-50 \cite{ResNet} architecture.
Operations enclosed in a []$_b$ block make up a single ResNet \enquote{bottleneck}
block (see Figure \ref{figure:bottleneck}). If the block is denoted as []$_{b/2}$,
the first conv operation in the block has a stride of 2. Note that the stride
is only applied to the first block, but not to repeated blocks.
}
\label{table:resnet}
\end{table}
}
\begin{figure}[t]
\centering
\includegraphics[width=0.3\textwidth]{figures/bottleneck}
\caption{
ResNet \cite{ResNet} \enquote{bottleneck} block introduced to reduce computational
complexity in deeper network variants, shown here with 256 input and output channels.
}
\label{figure:bottleneck}
\end{figure}

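For reference, the bottleneck block shown in the figure above can be written down in a few lines; the following PyTorch sketch (an illustration only, with batch normalization and ReLU placed as in the original ResNet paper) shows the 256-channel variant used in C$_2$:

\begin{verbatim}
import torch
import torch.nn as nn

class Bottleneck(nn.Module):
    # 1x1 reduce -> 3x3 -> 1x1 expand, with an identity shortcut (256 -> 64 -> 64 -> 256).
    def __init__(self, channels=256, reduced=64):
        super().__init__()
        self.conv1 = nn.Conv2d(channels, reduced, kernel_size=1, bias=False)
        self.bn1 = nn.BatchNorm2d(reduced)
        self.conv2 = nn.Conv2d(reduced, reduced, kernel_size=3, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(reduced)
        self.conv3 = nn.Conv2d(reduced, channels, kernel_size=1, bias=False)
        self.bn3 = nn.BatchNorm2d(channels)
        self.relu = nn.ReLU(inplace=True)

    def forward(self, x):
        out = self.relu(self.bn1(self.conv1(x)))
        out = self.relu(self.bn2(self.conv2(out)))
        out = self.bn3(self.conv3(out))
        return self.relu(out + x)     # residual connection around the three convolutions
\end{verbatim}
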
\subsection{Region-based convolutional networks}
\label{ssec:rcnn}
We now give an overview of region-based convolutional networks, which are currently by far the
most popular deep networks for object detection, and have recently also been applied to instance segmentation.

\paragraph{R-CNN}
@@ -146,6 +223,49 @@ variant based on Feature Pyramid Networks \cite{FPN}.

Figure \ref{} compares the two Mask R-CNN head variants.
\todo{RoI Align}

{
\begin{table}[h]
\centering
\begin{tabular}{llr}
layer id & layer operations & output dimensions \\
\toprule \\
& input image & H $\times$ W $\times$ C \\
\midrule \\
C$_4$ & \textbf{ResNet-50} [up to C$_4$] & $\tfrac{1}{16}$ H $\times$ $\tfrac{1}{16}$ W $\times$ 1024 \\
\midrule \\
\multicolumn{3}{c}{\textbf{Region Proposal Network (RPN)}}\\
\midrule \\
& From C$_4$: 1 $\times$ 1 conv, 512 & $\tfrac{1}{16}$ H $\times$ $\tfrac{1}{16}$ W $\times$ 512 \\
& 1 $\times$ 1 conv, 4 & $\tfrac{1}{16}$ H $\times$ $\tfrac{1}{16}$ W $\times$ 4 \\
& flatten & A $\times$ 4 \\
& decode bounding boxes \ref{} & A $\times$ 4 \\
boxes$_{\mathrm{RPN}}$ & sample bounding boxes \ref{} & N$_{RPN}$ $\times$ 4 \\
\midrule \\
\multicolumn{3}{c}{\textbf{RoI Head}}\\
\midrule \\
& From C$_4$ with boxes$_{\mathrm{RPN}}$: RoI pooling \ref{} & N$_{RPN}$ $\times$ 7 $\times$ 7 $\times$ 1024 \\
R$_1$ & \textbf{ResNet-50} [C$_5$ without stride] & N$_{RPN}$ $\times$ 7 $\times$ 7 $\times$ 2048 \\
ave & average pool & N$_{RPN}$ $\times$ 2048 \\
boxes & From ave: fully connected, 4 & N$_{RPN}$ $\times$ 4 \\
logits & From ave: fully connected, N$_{cls}$ & N$_{RPN}$ $\times$ N$_{cls}$ \\
\midrule \\
\multicolumn{3}{c}{\textbf{RoI Head: Masks}}\\
\midrule \\
& From R$_1$: 2 $\times$ 2 deconv, 256, stride 2 & N$_{RPN}$ $\times$ 14 $\times$ 14 $\times$ 256 \\
masks & 1 $\times$ 1 conv, N$_{cls}$ & N$_{RPN}$ $\times$ 14 $\times$ 14 $\times$ N$_{cls}$ \\
\bottomrule
\end{tabular}
\caption{
Mask R-CNN \cite{MaskRCNN} ResNet \cite{ResNet} architecture.
Note that this is equivalent to the Faster R-CNN architecture if the mask
head is left out.
}
\label{table:maskrcnn_resnet}
\end{table}
}

\paragraph{Bounding box regression}
All bounding boxes predicted by the RoI head or RPN are estimated as offsets
with respect to a reference bounding box. In the case of the RPN,
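For reference, a common choice for these offsets, and presumably the parametrization meant here, is the one used in Fast(er) R-CNN: a box with center $(x, y)$ and size $(w, h)$ is encoded relative to a reference box $(x_r, y_r, w_r, h_r)$ as
\begin{equation}
t_x = \frac{x - x_r}{w_r}, \quad
t_y = \frac{y - y_r}{h_r}, \quad
t_w = \log\frac{w}{w_r}, \quad
t_h = \log\frac{h}{h_r},
\end{equation}
where the reference box is typically an anchor box for the RPN and an RPN proposal for the RoI head.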
@@ -176,7 +176,7 @@ AEE: Average Endpoint Error; Fl-all: Ratio of pixels where flow estimate is

wrong by both $\geq 3$ pixels and $\geq 5\%$.
Camera and instance motion errors are averaged over the validation set.
We optionally train camera motion prediction (cam.),
replace the ResNet-50 backbone with ResNet-50-FPN (FPN),
or input XYZ coordinates into the backbone (XYZ).
We either supervise
object motions (sup.) with 3D motion ground truth (3D) or
figures/bottleneck.png | BIN (new file, 30 KiB, binary file not shown)
@@ -28,6 +28,7 @@

\usepackage{lipsum} % for generating Lorem Ipsum filler text
\usepackage[math]{blindtext} % for generating German filler text
\usepackage{hyperref} % links within the document
\usepackage{csquotes}

% INFO %
% The hyperref package should be loaded as late as possible, which is why it appears near the end of the preamble.

@@ -82,7 +83,7 @@

\newcommand{\todo}[1]{\textbf{\textcolor{red}{#1}}}
\setlength{\belowrulesep}{0pt}

% Title matter
\author{\myname}