\subsection{Datasets}
\paragraph{Virtual KITTI}
The synthetic Virtual KITTI dataset \cite{VKITTI} is a re-creation of the KITTI
driving scenario \cite{KITTI2012, KITTI2015}, rendered from virtual 3D street
scenes.
The dataset is made up of a total of 2126 frames from five different monocular
sequences recorded from a camera mounted on a virtual car.
Each sequence is rendered with varying lighting and weather conditions and
from different viewing angles, resulting in a total of 10 variants per sequence.
In addition to the RGB frames, a variety of ground truth is supplied.
For each frame, we are given dense depth and optical flow maps as well as the
camera extrinsics matrix.
For all cars and vans in each frame, we are given 2D and 3D object bounding
boxes, instance masks, 3D poses, and various other labels.
This makes the Virtual KITTI dataset ideally suited for developing our joint
instance segmentation and motion estimation system, as it allows us to test
individual components in isolation and to progress to increasingly complete
predictions, up to supervising the full system, on a single dataset.
For our experiments, we use the \emph{clone} sequences, which are rendered in a
way that most closely resembles the original KITTI dataset. We sample 100 examples
as our validation set. From the remaining 2026 examples, we remove the small
number of examples without any object instances and use the resulting data as
our training set.
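As an illustration, the split could be produced as in the following Python
sketch; the example identifiers and the \texttt{has\_instances} predicate are
hypothetical stand-ins for the actual bookkeeping:
\begin{verbatim}
import random

def split_vkitti_clone(example_ids, has_instances, seed=0):
    # Shuffle deterministically, hold out 100 examples for validation,
    # and drop training examples without any object instances.
    rng = random.Random(seed)
    ids = list(example_ids)
    rng.shuffle(ids)
    val_ids = ids[:100]
    train_ids = [i for i in ids[100:] if has_instances(i)]
    return train_ids, val_ids
\end{verbatim}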
\paragraph{Motion ground truth from 3D poses and camera extrinsics}
For two consecutive frames $I_t$ and $I_{t+1}$,
let $[R_t^{ex}|t_t^{ex}]$
and $[R_{t+1}^{ex}|t_{t+1}^{ex}]$
be the camera extrinsics at the two frames.
We compute the ground truth camera motion
$(R_t^{gt, cam}, t_t^{gt, cam}) \in \mathbf{SE}(3)$ as
\begin{equation}
R_{t}^{gt, cam} = R_{t+1}^{ex} \cdot (R_t^{ex})^{-1},
\end{equation}
\begin{equation}
t_{t}^{gt, cam} = t_{t+1}^{ex} - R_{t}^{gt, cam} \cdot t_t^{ex}.
\end{equation}
For any object $i$ visible in both frames, let
$(R_t^i, t_t^i)$ and $(R_{t+1}^i, t_{t+1}^i)$
be its orientation and position in camera space
at $I_t$ and $I_{t+1}$.
Note that the pose at $t$ is given with respect to the camera at $t$ and
the pose at $t+1$ is given with respect to the camera at $t+1$.
We define the ground truth pivot $p_{t}^{gt, i} \in \mathbb{R}^3$ as
\begin{equation}
p_{t}^{gt, i} = t_t^i
\end{equation}
and compute the ground truth object motion
$(R_t^{gt, i}, t_t^{gt, i}) \in \mathbf{SE}(3)$ as
\begin{equation}
R_{t}^{gt, i} = (R_t^{gt, cam})^{-1} \cdot R_{t+1}^i \cdot (R_t^i)^{-1},
\end{equation}
\begin{equation}
t_{t}^{gt, i} = t_{t+1}^{i} - R_t^{gt, cam} \cdot t_t^i.
\end{equation}
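A minimal NumPy sketch of these computations, with rotations as $3 \times 3$
matrices and translations as 3-vectors (function and argument names are ours,
not taken from the thesis code):
\begin{verbatim}
import numpy as np

def camera_motion(R_ex_t, t_ex_t, R_ex_t1, t_ex_t1):
    # Ground truth camera motion between frames t and t+1.
    # (For rotation matrices, the inverse equals the transpose.)
    R_cam = R_ex_t1 @ np.linalg.inv(R_ex_t)
    t_cam = t_ex_t1 - R_cam @ t_ex_t
    return R_cam, t_cam

def object_motion(R_cam, R_i_t, t_i_t, R_i_t1, t_i_t1):
    # Ground truth pivot and object motion for an object visible
    # in both frames, following the equations above.
    pivot = t_i_t
    R_obj = np.linalg.inv(R_cam) @ R_i_t1 @ np.linalg.inv(R_i_t)
    t_obj = t_i_t1 - R_cam @ t_i_t
    return R_obj, t_obj, pivot
\end{verbatim}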
\paragraph{Evaluation metrics with motion ground truth}
Given a foreground detection $k$ with an IoU of at least $0.5$ with a ground truth example,
let $i_k$ be the index of the best matching ground truth example,
let $c_k$ be the predicted class,
let $R^{k,c_k}, t^{k,c_k}, p^{k,c_k}$ be the predicted motion for class $c_k$
and $R^{gt,i_k}, t^{gt,i_k}, p^{gt,i_k}$ the ground truth motion for the example $i_k$.
Then, assuming there are $N$ such detections,
\begin{equation}
E_{R} = \frac{1}{N}\sum_k \arccos\left( \min\left\{1, \max\left\{-1, \frac{\operatorname{tr}\left((R^{k,c_k})^{-1} \cdot R^{gt,i_k}\right) - 1}{2} \right\}\right\} \right)
\end{equation}
measures the mean angle of the error rotation between the predicted and ground truth rotations,
\begin{equation}
E_{t} = \frac{1}{N}\sum_k \lVert (R^{k,c_k})^{-1} \cdot (t^{gt,i_k} - t^{k,c_k}) \rVert,
\end{equation}
is the mean Euclidean norm of the difference between predicted and ground truth translations, and
\begin{equation}
E_{p} = \frac{1}{N}\sum_k \lVert p^{gt,i_k} - p^{k,c_k} \rVert
\end{equation}
is the mean Euclidean norm of the difference between predicted and ground truth pivots.
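These metrics translate directly into NumPy (a sketch; \texttt{matches} pairs
each qualifying detection with its best-matching ground truth example):
\begin{verbatim}
import numpy as np

def motion_errors(matches):
    # matches: list of (R_pred, t_pred, p_pred, R_gt, t_gt, p_gt)
    # tuples, one per detection with IoU >= 0.5.
    e_r, e_t, e_p = [], [], []
    for R_pred, t_pred, p_pred, R_gt, t_gt, p_gt in matches:
        cos = (np.trace(np.linalg.inv(R_pred) @ R_gt) - 1.0) / 2.0
        e_r.append(np.arccos(np.clip(cos, -1.0, 1.0)))
        e_t.append(np.linalg.norm(np.linalg.inv(R_pred) @ (t_gt - t_pred)))
        e_p.append(np.linalg.norm(p_gt - p_pred))
    return np.mean(e_r), np.mean(e_t), np.mean(e_p)
\end{verbatim}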
\subsection{Training Setup}
Our training schedule is similar to the Mask R-CNN Cityscapes schedule \cite{MaskRCNN}.
We train on a single Titan X (Pascal) for a total of 192K iterations on the
Virtual KITTI training set. We use a learning rate of $0.25 \cdot 10^{-2}$ for the
first 144K iterations and $0.25 \cdot 10^{-3}$ for all remaining iterations.
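Written as a step function, the schedule is simply (a sketch of the schedule
described above):
\begin{verbatim}
def learning_rate(step):
    # 0.25e-2 for the first 144K iterations, 0.25e-3 afterwards;
    # training stops after 192K iterations.
    return 0.25e-2 if step < 144000 else 0.25e-3
\end{verbatim}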
\paragraph{R-CNN training parameters}
\todo{add this}
\subsection{Experiments on Virtual KITTI}
\todo{add this}
\subsection{Evaluation on KITTI 2015}
\todo{add this}