mirror of https://github.com/tu-darmstadt-informatik/bsc-thesis.git, synced 2025-12-13 09:55:49 +00:00

WIP

This commit is contained in: parent 65dddcc861, commit 3c294bddc8

approach.tex (49 changed lines)
@@ -1,5 +1,6 @@
\subsection{Motion R-CNN architecture}
\label{ssec:architecture}

Building on Mask R-CNN \cite{MaskRCNN},
we estimate per-object motion by predicting the 3D motion of each detected object.
@@ -76,8 +77,11 @@ Each instance motion is predicted as a set of nine scalar parameters,
$\sin(\alpha)$, $\sin(\beta)$, $\sin(\gamma)$, $t_t^k$ and $p_t^k$,
where $\sin(\alpha)$, $\sin(\beta)$ and $\sin(\gamma)$ are clipped to $[-1, 1]$.
Here, we assume that motions between frames are relatively small
and that objects rotate by at most 90 degrees in either direction about any axis,
which is in general a safe assumption for image sequences from videos.
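Under this assumption the cosines are non-negative, so the full rotation can be recovered from the three predicted sines alone; as an illustration (the exact composition order of the elemental rotations is an assumption here and not fixed by the text),
\begin{equation*}
\cos(\alpha) = \sqrt{1 - \sin^2(\alpha)}, \qquad
R_t^k = R_x(\alpha) \cdot R_y(\beta) \cdot R_z(\gamma).
\end{equation*}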
All predictions are made in camera space, and translation and pivot predictions are in meters.
We additionally predict softmax scores $o_t^k$ for classifying the objects as
still or moving.
\todo{figure of head}

\paragraph{Camera motion prediction}
@@ -86,8 +90,11 @@ between the two frames $I_t$ and $I_{t+1}$.
For this, we flatten the bottleneck output of the backbone and pass it through a fully connected layer.
We again represent $R_t^{cam}$ using an Euler angle representation and
predict $\sin(\alpha)$, $\sin(\beta)$, $\sin(\gamma)$ and $t_t^{cam}$ in the same way as for the individual objects.
Again, we predict a softmax score $o_t^{cam}$ for differentiating between
a still and a moving camera.
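As an illustration, a minimal TensorFlow/Keras sketch of such a camera motion head; the variable names and the single fully connected layer producing all outputs are our own assumptions and do not reflect the actual implementation:
\begin{verbatim}
import tensorflow as tf

def camera_motion_head(bottleneck):
    # bottleneck: backbone output for the frame pair, shape [N, H, W, C]
    x = tf.keras.layers.Flatten()(bottleneck)
    out = tf.keras.layers.Dense(3 + 3 + 2)(x)          # sines, translation, class logits
    sines = tf.clip_by_value(out[:, 0:3], -1.0, 1.0)   # sin(alpha), sin(beta), sin(gamma)
    t_cam = out[:, 3:6]                                 # camera translation in meters
    o_cam = tf.nn.softmax(out[:, 6:8])                  # still vs. moving camera
    return sines, t_cam, o_cam
\end{verbatim}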

\subsection{Supervision}
\label{ssec:supervision}

\paragraph{Per-RoI supervision with 3D motion ground truth}
The most straightforward way to supervise the object motions is by using ground truth
@@ -97,32 +104,47 @@ Given the $k$-th positive RoI, let $i_k$ be the index of the matched ground trut
let $R^{k,c_k}, t^{k,c_k}, p^{k,c_k}$ be the predicted motion for class $c_k$
and $R^{gt,i_k}, t^{gt,i_k}, p^{gt,i_k}$ the ground truth motion for the example $i_k$.
Note that we drop the subscript $t$ to increase readability.
Similar to the camera pose regression loss in \cite{PoseNet2},
we use a variant of the $\ell_1$-loss to penalize the differences between ground truth and predicted
rotation, translation and, in our case, pivot. We found that the smooth $\ell_1$-loss
performs better than the standard $\ell_1$-loss.
For each RoI, we compute the motion loss $L_{motion}^k$ as a linear sum of
the individual losses,

\begin{equation}
L_{motion}^k = l_{p}^k + (l_{R}^k + l_{t}^k) \cdot o^{gt,i_k} + l_o^k,
\end{equation}
where
\begin{equation}
l_{R}^k = \ell_1^* (R^{gt,i_k} - R^{k,c_k}),
\end{equation}
\begin{equation}
l_{t}^k = \ell_1^* (t^{gt,i_k} - t^{k,c_k}),
\end{equation}
\begin{equation}
l_{p}^k = \ell_1^* (p^{gt,i_k} - p^{k,c_k})
\end{equation}
are the smooth $\ell_1$-losses for the predicted rotation, translation and pivot,
respectively, and
\begin{equation}
l_o^k = \ell_{cls}(o_t^k, o^{gt,i_k})
\end{equation}
is the cross-entropy loss for the predicted classification into moving and non-moving objects.

Note that we do not penalize the rotation and translation of objects with
$o^{gt,i_k} = 0$, i.e. objects which do not move between $t$ and $t+1$.
We found that the network may not reliably predict exact identity motions for still objects;
classifying objects as moving or non-moving and discarding the regression
for the non-moving ones is numerically easier to optimize.
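For concreteness, a NumPy sketch of the per-RoI motion loss described above; summing the elementwise smooth $\ell_1$ terms and using a binary cross-entropy for $l_o^k$ reflect our reading of the equations rather than the actual implementation:
\begin{verbatim}
import numpy as np

def smooth_l1(x):
    ax = np.abs(x)
    return np.where(ax < 1.0, 0.5 * ax ** 2, ax - 0.5).sum()

def motion_loss(R, t, p, o, R_gt, t_gt, p_gt, o_gt, eps=1e-7):
    # R, t, p: predicted rotation (3x3), translation (3,), pivot (3,)
    # o: predicted probability that the object moves; o_gt in {0, 1}
    l_R = smooth_l1(R_gt - R)
    l_t = smooth_l1(t_gt - t)
    l_p = smooth_l1(p_gt - p)
    l_o = -(o_gt * np.log(o + eps) + (1 - o_gt) * np.log(1 - o + eps))
    # rotation and translation are only penalized for moving objects (o_gt = 1)
    return l_p + (l_R + l_t) * o_gt + l_o
\end{verbatim}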

\paragraph{Camera motion supervision}
We supervise the camera motion with ground truth analogously to the
object motions, with the only difference being that we only have
a rotation and translation, but no pivot term for the camera motion.
If the ground truth shows that the camera is not moving, we again do not
penalize rotation and translation. For the camera, the loss is reduced to the
classification term in this case.

\paragraph{Per-RoI supervision \emph{without} 3D motion ground truth}
A more general way to supervise the object motions is a re-projection

@@ -166,6 +188,7 @@ which can make it interesting even when 3D motion ground truth is available.

\subsection{Dense flow from motion}
\label{ssec:postprocessing}
As a postprocessing step, we compose a dense optical flow map from the outputs of our Motion R-CNN network.
Given the depth map $d_t$ for frame $I_t$, we first create a 3D point cloud in camera space at time $t$,
where
background.tex

@@ -1,6 +1,22 @@
In this section, we will give a more detailed description of previous works
we directly build on and other prerequisites.

\subsection{Basic definitions}
For regression, we define the smooth $\ell_1$-loss as
\begin{equation}
\ell_1^*(x) =
\begin{cases}
0.5 x^2 &\text{if } |x| < 1 \\
|x| - 0.5 &\text{otherwise,}
\end{cases}
\end{equation}
which provides a certain robustness to outliers and will be used
frequently in the following chapters.
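For example, $\ell_1^*(0.5) = 0.5 \cdot 0.5^2 = 0.125$, whereas $\ell_1^*(3) = 3 - 0.5 = 2.5$, so large residuals are penalized only linearly.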
For classification, we define the cross-entropy loss as
\begin{equation}
\ell_{cls}(o, o^{gt}) = - o^{gt} \log(o) - (1 - o^{gt}) \log(1 - o),
\end{equation}
where $o$ denotes the predicted score and $o^{gt} \in \{0, 1\}$ the ground truth label.

\subsection{Optical flow and scene flow}
Let $I_1,I_2 : P \to \mathbb{R}^3$ be two temporally consecutive frames in a
sequence of images.

@@ -43,7 +59,17 @@ Note that the maximum displacement that can be correctly estimated only depends
operations in the encoder.
Recently, other encoder-decoder CNNs have been applied to optical flow as well \cite{DenseNetDenseFlow}.

\subsection{SfM-Net}
Here, we will describe the SfM-Net architecture in more detail, show its results
and discuss some of its issues.

\subsection{ResNet}
\label{ssec:resnet}
For completeness, we will give a short review of the ResNet \cite{ResNet} architecture we will use
as a backbone CNN for our network.

\subsection{Region-based convolutional networks}
\label{ssec:rcnn}
We now give a short review of region-based convolutional networks, which are currently by far the
most popular deep networks for object detection, and have recently also been applied to instance segmentation.

@@ -51,7 +77,7 @@ most popular deep networks for object detection, and have recently also been app
Region-based convolutional networks (R-CNNs) \cite{RCNN} use a non-learned algorithm external to a standard encoder CNN
for computing \emph{region proposals} in the shape of 2D bounding boxes, which represent regions that may contain an object.
For each of the region proposals, the input image is cropped using the region's bounding box and the crop is
passed through a CNN, which performs classification of the object (or non-object, if the region shows background).

\paragraph{Fast R-CNN}
The original R-CNN involves computing one forward pass of the CNN for each of the region proposals,

@@ -120,6 +146,69 @@ variant based on Feature Pyramid Networks \cite{FPN}.
Figure \ref{} compares the two Mask R-CNN head variants.
\todo{RoI Align}

\paragraph{Bounding box regression}
All bounding boxes predicted by the RoI head or RPN are estimated as offsets
with respect to a reference bounding box. In the case of the RPN,
the reference bounding box is one of the anchors, and refined bounding boxes from the RoI head are
predicted relative to the RPN output bounding boxes.
Let $(x, y, w, h)$ be the top-left coordinates, width and height of the bounding box
to be predicted. Likewise, let $(x^*, y^*, w^*, h^*)$ be the ground truth bounding
box and let $(x_r, y_r, w_r, h_r)$ be the reference bounding box.
We then define the ground truth \emph{box encoding} $b^*$ as
\begin{equation*}
b^* = (b_x^*, b_y^*, b_w^*, b_h^*),
\end{equation*}
where
\begin{equation*}
b_x^* = \frac{x^* - x_r}{w_r},
\end{equation*}
\begin{equation*}
b_y^* = \frac{y^* - y_r}{h_r},
\end{equation*}
\begin{equation*}
b_w^* = \log \left( \frac{w^*}{w_r} \right),
\end{equation*}
\begin{equation*}
b_h^* = \log \left( \frac{h^*}{h_r} \right),
\end{equation*}
which represents the regression target for the bounding box refinement
outputs of the network.
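As an illustration, a minimal Python sketch of this encoding (the function and variable names are ours):
\begin{verbatim}
import math

def encode_box(box, ref):
    # box, ref: (x, y, w, h) with top-left corner, width and height
    x, y, w, h = box
    x_r, y_r, w_r, h_r = ref
    return ((x - x_r) / w_r, (y - y_r) / h_r,
            math.log(w / w_r), math.log(h / h_r))
\end{verbatim}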

In the same way, we define the predicted box encoding $b$ as
\begin{equation*}
b = (b_x, b_y, b_w, b_h),
\end{equation*}
where
\begin{equation*}
b_x = \frac{x - x_r}{w_r},
\end{equation*}
\begin{equation*}
b_y = \frac{y - y_r}{h_r},
\end{equation*}
\begin{equation*}
b_w = \log \left( \frac{w}{w_r} \right),
\end{equation*}
\begin{equation*}
b_h = \log \left( \frac{h}{h_r} \right).
\end{equation*}

At test time, to get from a predicted box encoding $(b_x, b_y, b_w, b_h)$ to the actual bounding box $(x, y, w, h)$,
we invert the definitions above,
\begin{equation*}
x = b_x \cdot w_r + x_r,
\end{equation*}
\begin{equation*}
y = b_y \cdot h_r + y_r,
\end{equation*}
\begin{equation*}
w = \exp(b_w) \cdot w_r,
\end{equation*}
\begin{equation*}
h = \exp(b_h) \cdot h_r,
\end{equation*}
and thus obtain the bounding box as the reference bounding box adjusted by
the predicted relative offsets and scales.
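Correspondingly, a minimal sketch of the decoding applied at test time; \texttt{decode\_box(encode\_box(b, r), r)} recovers the original box:
\begin{verbatim}
import math

def decode_box(enc, ref):
    # enc: predicted encoding (b_x, b_y, b_w, b_h); ref: (x_r, y_r, w_r, h_r)
    b_x, b_y, b_w, b_h = enc
    x_r, y_r, w_r, h_r = ref
    return (b_x * w_r + x_r, b_y * h_r + y_r,
            math.exp(b_w) * w_r, math.exp(b_h) * h_r)
\end{verbatim}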

\paragraph{Supervision of the RPN}
\todo{TODO}

conclusion.tex

@@ -14,12 +14,15 @@ for real time scenarios.
We thus presented a step towards real time 3D motion estimation based on a
physically sound scene decomposition. Thanks to instance-level reasoning, in contrast
to previous end-to-end deep networks for dense motion estimation, the output
of our network is highly interpretable, which may also bring benefits for safety-critical
applications.

\subsection{Future Work}
\paragraph{Predicting depth}
In this work, we focused on motion estimation when RGB-D frames with dense depth are available.
However, in many application settings, we are not provided with any depth information.
In most cases, we want to work with raw RGB sequences from one or multiple simple cameras,
from which no depth data is available.
To handle this, we could integrate depth prediction into our network by branching off a
depth network from the backbone in parallel to the RPN, as in Figure \ref{}.
Although single-frame monocular depth prediction with deep networks was already done

experiments.tex

@@ -14,6 +14,7 @@ we use the \texttt{tf.crop\_and\_resize} TensorFlow function with
interpolation set to bilinear.
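For illustration, such a call could look as follows (assuming a current TensorFlow version, where the op is exposed as \texttt{tf.image.crop\_and\_resize} and boxes are given as normalized $(y_1, x_1, y_2, x_2)$ coordinates; the wrapper name and crop size are our own choices):
\begin{verbatim}
import tensorflow as tf

def roi_crop(feature_maps, normalized_boxes, box_to_batch, crop_size=14):
    # feature_maps: [batch, H, W, C]; normalized_boxes: [num_boxes, 4]
    # as (y1, x1, y2, x2) in [0, 1]; box_to_batch: [num_boxes] batch indices
    return tf.image.crop_and_resize(
        feature_maps, normalized_boxes, box_to_batch,
        crop_size=[crop_size, crop_size], method="bilinear")
\end{verbatim}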

\subsection{Datasets}
\label{ssec:datasets}

\paragraph{Virtual KITTI}
The synthetic Virtual KITTI dataset \cite{VKITTI} is a re-creation of the KITTI

@@ -54,12 +55,22 @@ We compute the ground truth camera motion
$\{R_t^{gt, cam}, t_t^{gt, cam}\} \in \mathbf{SE}(3)$ as

\begin{equation}
R_{t}^{gt, cam} = R_{t+1}^{ex} \cdot \mathrm{inv}(R_t^{ex}),
\end{equation}
\begin{equation}
t_{t}^{gt, cam} = t_{t+1}^{ex} - R_{t}^{ex} \cdot t_t^{ex}.
\end{equation}

Additionally, we define $o_t^{gt, cam} \in \{ 0, 1 \}$,
\begin{equation}
o_t^{gt, cam} =
\begin{cases}
1 &\text{if the camera pose changes between $t$ and $t+1$} \\
0 &\text{otherwise,}
\end{cases}
\end{equation}
which specifies whether the camera is moving between the frames.
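A small NumPy sketch of this preprocessing step, transcribing the definitions above (the tolerance used to detect a static camera is our assumption):
\begin{verbatim}
import numpy as np

def camera_motion_gt(R_ex_t, t_ex_t, R_ex_t1, t_ex_t1, atol=1e-6):
    # (R_ex, t_ex): camera extrinsics at frames t and t+1
    R_cam = R_ex_t1 @ np.linalg.inv(R_ex_t)
    t_cam = t_ex_t1 - R_ex_t @ t_ex_t
    moving = not (np.allclose(R_ex_t, R_ex_t1, atol=atol) and
                  np.allclose(t_ex_t, t_ex_t1, atol=atol))
    return R_cam, t_cam, int(moving)
\end{verbatim}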

For any object $i$ visible in both frames, let
$(R_t^i, t_t^i)$ and $(R_{t+1}^i, t_{t+1}^i)$
be its orientation and position in camera space

@@ -77,12 +88,22 @@ and compute the ground truth object motion
$\{R_t^{gt, i}, t_t^{gt, i}\} \in \mathbf{SE}(3)$ as

\begin{equation}
R_{t}^{gt, i} = \mathrm{inv}(R_t^{gt, cam}) \cdot R_{t+1}^i \cdot \mathrm{inv}(R_t^i),
\end{equation}
\begin{equation}
t_{t}^{gt, i} = t_{t+1}^{i} - R_t^{gt, cam} \cdot t_t^i.
\end{equation}

As for the camera, we define $o_t^{gt, i} \in \{ 0, 1 \}$,
\begin{equation}
o_t^{gt, i} =
\begin{cases}
1 &\text{if the position of object $i$ changes between $t$ and $t+1$} \\
0 &\text{otherwise,}
\end{cases}
\end{equation}
which specifies whether an object is moving between the frames.
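Analogously, a sketch of the object motion ground truth, again transcribing the equations above (names and the movement threshold are ours):
\begin{verbatim}
import numpy as np

def object_motion_gt(R_obj_t, t_obj_t, R_obj_t1, t_obj_t1, R_cam_gt, atol=1e-6):
    # (R_obj, t_obj): object pose in camera space at frames t and t+1,
    # R_cam_gt: ground truth camera rotation between the frames
    R_i = np.linalg.inv(R_cam_gt) @ R_obj_t1 @ np.linalg.inv(R_obj_t)
    t_i = t_obj_t1 - R_cam_gt @ t_obj_t
    moving = not np.allclose(t_obj_t, t_obj_t1, atol=atol)
    return R_i, t_i, int(moving)
\end{verbatim}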

\paragraph{Evaluation metrics with motion ground truth}
To evaluate the 3D instance and camera motions on the Virtual KITTI validation
set, we introduce a few error metrics.

@@ -93,21 +114,22 @@ let $R^{k,c_k}, t^{k,c_k}, p^{k,c_k}$ be the predicted motion for class $c_k$
and $R^{gt,i_k}, t^{gt,i_k}, p^{gt,i_k}$ the ground truth motion for the example $i_k$.
Then, assuming there are $N$ such detections,
\begin{equation}
E_{R} = \frac{1}{N}\sum_k \arccos\left( \min\left\{1, \max\left\{-1, \frac{\mathrm{tr}(\mathrm{inv}(R^{k,c_k}) \cdot R^{gt,i_k}) - 1}{2} \right\}\right\} \right)
\end{equation}
measures the mean angle of the error rotation between predicted and ground truth rotation,
\begin{equation}
E_{t} = \frac{1}{N}\sum_k \left\lVert \mathrm{inv}(R^{k,c_k}) \cdot (t^{gt,i_k} - t^{k,c_k}) \right\rVert_2,
\end{equation}
is the mean Euclidean norm between predicted and ground truth translation, and
\begin{equation}
E_{p} = \frac{1}{N}\sum_k \left\lVert p^{gt,i_k} - p^{k,c_k} \right\rVert_2
\end{equation}
is the mean Euclidean norm between predicted and ground truth pivot.
Analogously, we define error metrics $E_{R}^{cam}$ and $E_{t}^{cam}$ for
predicted camera motions.
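As an illustration, the per-detection terms of $E_R$ and $E_t$ in NumPy (the mean over all $N$ detections is taken outside; the function names are ours):
\begin{verbatim}
import numpy as np

def rotation_angle_error(R_pred, R_gt):
    # angle of the error rotation inv(R_pred) * R_gt, clamped for numerical safety
    cos_angle = (np.trace(np.linalg.inv(R_pred) @ R_gt) - 1.0) / 2.0
    return np.arccos(np.clip(cos_angle, -1.0, 1.0))

def translation_error(R_pred, t_pred, t_gt):
    # Euclidean norm of the translation error, rotated by the inverse prediction
    return np.linalg.norm(np.linalg.inv(R_pred) @ (t_gt - t_pred))
\end{verbatim}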

\subsection{Training Setup}
\label{ssec:setup}
Our training schedule is similar to the Mask R-CNN Cityscapes schedule \cite{MaskRCNN}.
We train on a single Titan X (Pascal) for a total of 192K iterations on the
Virtual KITTI training set.

@@ -120,6 +142,7 @@ first 144K iterations and $0.25 \cdot 10^{-3}$ for all remaining iterations.
\todo{add this}

\subsection{Experiments on Virtual KITTI}
\label{ssec:vkitti}

\begin{figure}[t]
\centering

introduction.tex
@@ -9,7 +9,7 @@ and estimates their 3D locations as well as all 3D object motions between the fr
\label{figure:teaser}
\end{figure}

\subsection{Motivation}

When moving in the real world, it is generally desirable to know which objects exist
in the proximity of the moving agent,

@@ -35,7 +35,7 @@ sequences of images, segment the image pixels into object instances and estimate
the location and 3D motion of each object instance relative to the camera
(Figure \ref{figure:teaser}).

\subsection{Technical goals}

Recently, SfM-Net \cite{SfmNet} introduced an end-to-end deep learning approach for predicting depth
and dense optical flow in monocular image sequences based on estimating the 3D motion of individual objects and the camera.
|
|||||||
ours in that we also need to output various rotations and translations from a deep network
|
ours in that we also need to output various rotations and translations from a deep network
|
||||||
and thus need to solve similar regression problems and use similar parametrizations
|
and thus need to solve similar regression problems and use similar parametrizations
|
||||||
and losses.
|
and losses.
|
||||||
|
|
||||||
|
|
||||||
|
\subsection{Outline}
|
||||||
|
In section \ref{sec:background}, we introduce preliminaries and building
|
||||||
|
blocks from earlier works that serve as a foundation for our networks and losses.
|
||||||
|
Most importantly, we re-view the ResNet CNN (\ref{ssec:resnet}) that will serve as CNN backbone
|
||||||
|
as well as the developments in region-based CNNs onto which we build (\ref{ssec:rcnn}),
|
||||||
|
specifically Mask R-CNN and the FPN \cite{FPN}.
|
||||||
|
In section \ref{sec:approach}, we describe our technical contribution, starting
|
||||||
|
with our modifications to the Mask R-CNN backbone and head networks (\ref{ssec:architecture}),
|
||||||
|
followed by our losses and supervision methods for training
|
||||||
|
the extended region-based CNN (\ref{ssec:supervision}), and
|
||||||
|
finally the postprocessings we use to derive dense flow from our 3D motion estimates
|
||||||
|
(\ref{ssec:postprocessing}).
|
||||||
|
In section \ref{sec:experiments}, we introduce the Virtual KITTI dataset we use
|
||||||
|
for training our networks as well as all preprocessings we perform (\ref{ssec:datasets}),
|
||||||
|
give details of our experimental setup (\ref{ssec:setup}),
|
||||||
|
and finally describe the experimental results
|
||||||
|
on Virtual KITTI (\ref{ssec:vkitti}).
|
||||||
|
In section \ref{sec:conclusion}, we summarize our work and describe future
|
||||||
|
developments, including depth prediction, training on real world data,
|
||||||
|
and exploiting frames over longer time intervals.
|
||||||
|
|||||||

@@ -123,17 +123,20 @@
\onehalfspacing

\input{introduction}
\label{sec:introduction}

\section{Background}
\parindent 2em
\onehalfspacing

\label{sec:background}
\input{background}

\section{Motion R-CNN}
\parindent 2em
\onehalfspacing

\label{sec:approach}
\input{approach}

\section{Experiments}

@@ -141,12 +144,14 @@
\onehalfspacing

\input{experiments}
\label{sec:experiments}

\section{Conclusion}
\parindent 2em
\onehalfspacing

\input{conclusion}
\label{sec:conclusion}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Bibliography with BibLaTeX