\subsection{Motion R-CNN}
\label{ssec:model}

Building on Mask R-CNN \cite{MaskRCNN},
we estimate per-object motion by predicting the 3D motion of each detected object.
For this, we extend Mask R-CNN in two straightforward ways.
First, we modify the backbone network and provide it with two consecutive frames
to enable image matching between them.
Second, we extend the Mask R-CNN RoI head to predict a 3D motion and pivot for each
region proposal. Tables \ref{table:motionrcnn_resnet} and \ref{table:motionrcnn_resnet_fpn}
show our Motion R-CNN networks based on Mask R-CNN ResNet and Mask R-CNN ResNet-FPN,
respectively.
{\begin{table}[h]
\centering
\begin{tabular}{llr}
\toprule
\textbf{Output} & \textbf{Layer Operations} & \textbf{Output Dimensions} \\
\midrule\midrule
& input images $I_t$, $I_{t+1}$, and (optional) XYZ$_{t}$, XYZ$_{t+1}$ & H $\times$ W $\times$ C \\
\midrule
C$_4$ & ResNet \{up to C$_4$\} (Table \ref{table:resnet}) & $\tfrac{1}{16}$ H $\times$ $\tfrac{1}{16}$ W $\times$ 1024 \\
\midrule
\multicolumn{3}{c}{\textbf{Region Proposal Network (RPN)} (Table \ref{table:maskrcnn_resnet})}\\
\midrule
\multicolumn{3}{c}{\textbf{Camera Motion Network}}\\
\midrule
& From C$_4$: ResNet \{C$_5$\} (Table \ref{table:resnet}) & $\tfrac{1}{32}$ H $\times$ $\tfrac{1}{32}$ W $\times$ 2048 \\
& ResNet \{C$_6$\} (Table \ref{table:resnet}) & $\tfrac{1}{64}$ H $\times$ $\tfrac{1}{64}$ W $\times$ 2048 \\
& 1 $\times$ 1 conv, 512 & $\tfrac{1}{64}$ H $\times$ $\tfrac{1}{64}$ W $\times$ 512 \\
& bilinear resize, 7 $\times$ 7 & 7 $\times$ 7 $\times$ 512 \\
& flatten & 1 $\times$ 7 $\cdot$ 7 $\cdot$ 512 \\
T$_0$ & $\begin{bmatrix}\textrm{fully connected}, 1024\end{bmatrix}$ $\times$ 2 & 1 $\times$ 1024 \\
$R_{cam}$ & From T$_0$: fully connected, 3 & 1 $\times$ 3 \\
$t_{cam}$ & From T$_0$: fully connected, 3 & 1 $\times$ 3 \\
& From T$_0$: fully connected, 2 & 1 $\times$ 2 \\
$o_{cam}$ & softmax, 2 & 1 $\times$ 2 \\
\midrule
\multicolumn{3}{c}{\textbf{RoI Head \& RoI Head: Masks} (Table \ref{table:maskrcnn_resnet})}\\
\midrule
\multicolumn{3}{c}{\textbf{RoI Head: Motions}}\\
\midrule
%& From M$_0$: flatten & N$_{RoI}$ $\times$ 7 $\cdot$ 7 $\cdot$ 256 \\
T$_1$ & From ave: $\begin{bmatrix}\textrm{fully connected}, 1024\end{bmatrix}$ $\times$ 2 & N$_{RoI}$ $\times$ 1024 \\
$\forall k: R_k$ & From T$_1$: fully connected, 3 & N$_{RoI}$ $\times$ 3 \\
$\forall k: t_k$ & From T$_1$: fully connected, 3 & N$_{RoI}$ $\times$ 3 \\
$\forall k: p_k$ & From T$_1$: fully connected, 3 & N$_{RoI}$ $\times$ 3 \\
& From T$_1$: fully connected, 2 & N$_{RoI}$ $\times$ 2 \\
$\forall k: o_k$ & softmax, 2 & N$_{RoI}$ $\times$ 2 \\
\bottomrule
\end{tabular}
\caption{
Motion R-CNN ResNet architecture based on the Mask R-CNN
ResNet architecture (Table \ref{table:maskrcnn_resnet}).
We use ReLU activations after all hidden layers and
additionally apply dropout with $p = 0.5$ after all fully-connected hidden layers.
}
\label{table:motionrcnn_resnet}
\end{table}
}
{
\begin{table}[h]
\centering
\begin{tabular}{llr}
\toprule
\textbf{Output} & \textbf{Layer Operations} & \textbf{Output Dimensions} \\
\midrule\midrule
& input images $I_t$, $I_{t+1}$, and (optional) XYZ$_{t}$, XYZ$_{t+1}$ & H $\times$ W $\times$ C \\
\midrule
C$_6$ & ResNet (Table \ref{table:resnet}) & $\tfrac{1}{64}$ H $\times$ $\tfrac{1}{64}$ W $\times$ 2048 \\
\midrule
\multicolumn{3}{c}{\textbf{RPN \& FPN} (Table \ref{table:maskrcnn_resnet_fpn})} \\
\midrule
\multicolumn{3}{c}{\textbf{Camera Motion Network}}\\
\midrule
& From C$_6$: 1 $\times$ 1 conv, 512 & $\tfrac{1}{64}$ H $\times$ $\tfrac{1}{64}$ W $\times$ 512 \\
& bilinear resize, 7 $\times$ 7 & 7 $\times$ 7 $\times$ 512 \\
& flatten & 1 $\times$ 7 $\cdot$ 7 $\cdot$ 512 \\
T$_2$ & $\begin{bmatrix}\textrm{fully connected}, 1024\end{bmatrix}$ $\times$ 2 & 1 $\times$ 1024 \\
$R_{cam}$ & From T$_2$: fully connected, 3 & 1 $\times$ 3 \\
$t_{cam}$ & From T$_2$: fully connected, 3 & 1 $\times$ 3 \\
& From T$_2$: fully connected, 2 & 1 $\times$ 2 \\
$o_{cam}$ & softmax, 2 & 1 $\times$ 2 \\
\midrule
\multicolumn{3}{c}{\textbf{RoI Head \& RoI Head: Masks} (Table \ref{table:maskrcnn_resnet_fpn})} \\
\midrule
\multicolumn{3}{c}{\textbf{RoI Head: Motions}}\\
\midrule
%& From M$_1$: flatten & N$_{RoI}$ $\times$ 14 $\cdot$ 14 $\cdot$ 256 \\
T$_3$ & From F$_1$: $\begin{bmatrix}\textrm{fully connected}, 1024\end{bmatrix}$ $\times$ 2 & N$_{RoI}$ $\times$ 1024 \\
$\forall k: R_k$ & From T$_3$: fully connected, 3 & N$_{RoI}$ $\times$ 3 \\
$\forall k: t_k$ & From T$_3$: fully connected, 3 & N$_{RoI}$ $\times$ 3 \\
$\forall k: p_k$ & From T$_3$: fully connected, 3 & N$_{RoI}$ $\times$ 3 \\
& From T$_3$: fully connected, 2 & N$_{RoI}$ $\times$ 2 \\
$\forall k: o_k$ & softmax, 2 & N$_{RoI}$ $\times$ 2 \\
\bottomrule
\end{tabular}
\caption{
Motion R-CNN ResNet-FPN architecture based on the Mask R-CNN
ResNet-FPN architecture (Table \ref{table:maskrcnn_resnet_fpn}).
To obtain a larger bottleneck stride, we compute the feature pyramid starting
with C$_6$ instead of C$_5$, and thus the subsampling from P$_5$ to P$_6$ is omitted.
The modifications are analogous to our Motion R-CNN ResNet variant,
but we still show the architecture for completeness.
Again, we use ReLU activations after all hidden layers and
additionally apply dropout with $p = 0.5$ after all fully-connected hidden layers.
}
\label{table:motionrcnn_resnet_fpn}
\end{table}
}
\paragraph{Motion R-CNN backbone}
Like Faster R-CNN and Mask R-CNN, we use a ResNet variant \cite{ResNet} as the backbone network to compute feature maps from the input imagery.

Inspired by FlowNetS \cite{FlowNet}, we make one modification to the ResNet backbone to enable image matching,
laying the foundation for our motion estimation. Instead of taking a single image as input to the backbone,
we depth-concatenate the two temporally consecutive frames $I_t$ and $I_{t+1}$, yielding an input image map with six channels.
Additionally, we experiment with concatenating the camera-space XYZ coordinates of each frame,
XYZ$_t$ and XYZ$_{t+1}$, into the input.
We do not introduce a separate network for computing region proposals and use our modified backbone network
both for the RPN and for extracting the RoI features.

Thus, our feature encoder network has to learn image matching representations similar to
those learned by the FlowNet encoder, but its output is computed in the
object-centric framework of a region-based convolutional network head with a 3D parametrization.
In contrast to the dense FlowNet decoder, the dense image matching information estimated by
the encoder is integrated for specific objects via RoI extraction and subsequently
processed by the RoI head for each object.
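
For illustration, the backbone input can be assembled as in the following sketch, assuming the images and XYZ maps are given as $H \times W \times 3$ arrays (helper name chosen for illustration only):
\begin{verbatim}
import numpy as np

def build_backbone_input(img_t, img_t1, xyz_t=None, xyz_t1=None):
    """Depth-concatenate two frames (and optional XYZ maps) along channels."""
    maps = [img_t, img_t1]            # 2 x (H, W, 3) -> 6 channels
    if xyz_t is not None and xyz_t1 is not None:
        maps += [xyz_t, xyz_t1]       # + 2 x (H, W, 3) -> 12 channels
    return np.concatenate(maps, axis=-1)
\end{verbatim}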
\paragraph{Per-RoI motion prediction}
We use a 3D rigid motion parametrization similar to the one used in SE3-Nets and SfM-Net \cite{SE3Nets, SfmNet}.
For the $k$-th object proposal, we predict the rigid transformation $\{R_k, t_k\}\in \mathbf{SE}(3)$%
\footnote{$\mathbf{SE}(3)$ refers to the Special Euclidean Group representing 3D rotations
and translations: $\{R, t \mid R \in \mathbf{SO}(3), t \in \mathbb{R}^3\}$.}
of the object between the two frames $I_t$ and $I_{t+1}$, as well as the object pivot $p_k \in \mathbb{R}^3$ at time $t$.
We parametrize $R_k$ using an Euler angle representation,
\begin{equation}
R_k = R_k^z(\gamma) \cdot R_k^x(\alpha) \cdot R_k^y(\beta),
\end{equation}
where
\begin{equation}
R_k^x(\alpha) =
\begin{pmatrix}
1 & 0 & 0 \\
0 & \cos(\alpha) & -\sin(\alpha) \\
0 & \sin(\alpha) & \cos(\alpha)
\end{pmatrix},
\end{equation}
\begin{equation}
R_k^y(\beta) =
\begin{pmatrix}
\cos(\beta) & 0 & \sin(\beta) \\
0 & 1 & 0 \\
-\sin(\beta) & 0 & \cos(\beta)
\end{pmatrix},
\end{equation}
\begin{equation}
R_k^z(\gamma) =
\begin{pmatrix}
\cos(\gamma) & -\sin(\gamma) & 0 \\
\sin(\gamma) & \cos(\gamma) & 0 \\
0 & 0 & 1
\end{pmatrix},
\end{equation}
and $\alpha, \beta, \gamma$ are the rotation angles in radians about the $x$-, $y$- and $z$-axis, respectively.
We then extend the Mask R-CNN head by adding a small fully-connected network for motion
prediction in addition to the fully-connected layers for
refined boxes and classes and the convolutional network for the masks.
As for refined boxes and masks, we make one separate motion prediction for each class.
Each instance motion is predicted as a set of nine values,
$\sin(\alpha)$, $\sin(\beta)$, $\sin(\gamma)$, $t_k$ and $p_k$,
where $\sin(\alpha)$, $\sin(\beta)$ and $\sin(\gamma)$ are clipped to $[-1, 1]$.
Here, we assume that motions between frames are relatively small
and that objects rotate at most 90 degrees in either direction about any axis,
which is in general a safe assumption for image sequences from videos
and enables us to recover unique cosine values from the predicted sine values,
e.g. $\cos(\alpha) = \sqrt{1 - \sin^2(\alpha)}$.
All predictions are made in camera space, and translation and pivot predictions are in meters.
We additionally predict softmax scores $o_k$ for classifying each object as
still or moving. As a postprocessing step, for any object instance $k$ with predicted moving flag $o_k = 0$,
we set $\sin(\alpha) = \sin(\beta) = \sin(\gamma) = 0$ and $t_k = (0,0,0)^T$,
and thus predict an identity motion.
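
For illustration, the following sketch decodes the nine predicted values into $R_k$, $t_k$ and $p_k$, assuming the moving/still decision is obtained by thresholding the softmax score at $0.5$ (function name chosen for illustration only):
\begin{verbatim}
import numpy as np

def decode_instance_motion(pred, moving_prob):
    """Decode (sin a, sin b, sin g, t, p) into R_k, t_k, p_k.

    pred: array of shape (9,); moving_prob: softmax score of the 'moving' class.
    """
    sins = np.clip(pred[:3], -1.0, 1.0)
    t, p = pred[3:6].copy(), pred[6:9].copy()
    if moving_prob < 0.5:                      # classified as still -> identity motion
        sins, t = np.zeros(3), np.zeros(3)
    coss = np.sqrt(1.0 - sins ** 2)            # unique, since |angle| <= 90 degrees
    (sa, sb, sg), (ca, cb, cg) = sins, coss
    R_x = np.array([[1, 0, 0], [0, ca, -sa], [0, sa, ca]])
    R_y = np.array([[cb, 0, sb], [0, 1, 0], [-sb, 0, cb]])
    R_z = np.array([[cg, -sg, 0], [sg, cg, 0], [0, 0, 1]])
    return R_z @ R_x @ R_y, t, p               # R_k = Rz * Rx * Ry, as above
\end{verbatim}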
\paragraph{Camera motion prediction}
In addition to the object transformations, we optionally predict the camera motion $\{R_{cam}, t_{cam}\}\in \mathbf{SE}(3)$
between the two frames $I_t$ and $I_{t+1}$.
For this, we branch off a small fully-connected network from the bottleneck output of the backbone.
We again represent $R_{cam}$ using an Euler angle representation and
predict $\sin(\alpha)$, $\sin(\beta)$, $\sin(\gamma)$ and $t_{cam}$ in the same way as for the individual objects.
Again, we predict a softmax score $o_{cam}$ for differentiating between
a still and a moving camera.
\subsection{Network design}
\label{ssec:design}

\paragraph{Camera motion network}
In our ResNet variant without FPN (Table \ref{table:motionrcnn_resnet}), the underlying
ResNet backbone is only computed up to the $C_4$ block, as otherwise the
feature resolution prior to RoI extraction would be reduced too much.
Therefore, in this variant we first pass the $C_4$ features through additional $C_5$
and $C_6$ blocks (with weights independent of the $C_5$ block used in the RoI head)
to increase the bottleneck stride prior to the camera motion network to 64.
In our ResNet-FPN variant (Table \ref{table:motionrcnn_resnet_fpn}),
the backbone makes use of all blocks up to and including $C_6$, and
we can simply branch off our camera motion network from the $C_6$ bottleneck.
In both the ResNet and the ResNet-FPN variant, we then apply one additional
$1 \times 1$ convolution to the $C_6$ features to reduce the number of inputs to the following
fully-connected layers, and thus keep the number of weights reasonably small.
Instead of averaging, we use bilinear resizing to bring the convolutional features
to a fixed size without losing all spatial information,
flatten them, and finally apply multiple fully-connected layers to predict the
camera motion parameters.
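
The following PyTorch sketch outlines this camera motion network; layer names are illustrative and do not correspond to our implementation, and the $C_6$ bottleneck is assumed to have 2048 channels as in Tables \ref{table:motionrcnn_resnet} and \ref{table:motionrcnn_resnet_fpn}:
\begin{verbatim}
import torch
import torch.nn as nn
import torch.nn.functional as F

class CameraMotionHead(nn.Module):
    """Predicts sin(alpha), sin(beta), sin(gamma), t_cam and o_cam."""

    def __init__(self, in_channels=2048):
        super().__init__()
        self.reduce = nn.Conv2d(in_channels, 512, kernel_size=1)  # 1x1 conv, 512
        self.fc1 = nn.Linear(7 * 7 * 512, 1024)
        self.fc2 = nn.Linear(1024, 1024)
        self.rot = nn.Linear(1024, 3)      # sin(alpha), sin(beta), sin(gamma)
        self.trans = nn.Linear(1024, 3)    # t_cam
        self.moving = nn.Linear(1024, 2)   # still vs. moving logits

    def forward(self, c6):
        x = F.relu(self.reduce(c6))
        # bilinear resize to 7 x 7 instead of average pooling
        x = F.interpolate(x, size=(7, 7), mode='bilinear', align_corners=False)
        x = torch.flatten(x, start_dim=1)
        x = F.dropout(F.relu(self.fc1(x)), p=0.5, training=self.training)
        x = F.dropout(F.relu(self.fc2(x)), p=0.5, training=self.training)
        return self.rot(x), self.trans(x), F.softmax(self.moving(x), dim=1)
\end{verbatim}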
\paragraph{RoI motion head network}
In both of our network variants
(Tables \ref{table:motionrcnn_resnet} and \ref{table:motionrcnn_resnet_fpn}),
we compute the fully-connected network for motion prediction from the
flattened RoI features, which are also the basis for classification and
bounding box refinement.
Note that the features passed to our ResNet-FPN RoI head
(extracted from the upsampled FPN stage appropriate to the RoI bounding box scale)
have passed through the $C_6$
bottleneck, which has a stride of 64 with respect to the original image.
In contrast, the bottleneck for the features passed to our ResNet RoI head
is $C_4$ (with a stride of 16). Thus, the ResNet-FPN variant can in principle estimate
object motions based on larger displacements than the ResNet variant.
Additionally, as smaller bounding boxes use higher-resolution features, the
motions and pivots of (especially smaller) objects can in principle be estimated more accurately
with the FPN variant.
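
Analogously, a sketch of the RoI motion head with per-class rotation, translation and pivot outputs as described above (the size of the flattened RoI features depends on the variant; layer names are again illustrative):
\begin{verbatim}
import torch
import torch.nn as nn
import torch.nn.functional as F

class RoIMotionHead(nn.Module):
    """RoI motion head: one (R_k, t_k, p_k) prediction per class, plus o_k."""

    def __init__(self, in_features, num_classes):
        super().__init__()
        self.num_classes = num_classes
        self.fc1 = nn.Linear(in_features, 1024)
        self.fc2 = nn.Linear(1024, 1024)
        self.rot = nn.Linear(1024, num_classes * 3)    # sines per class
        self.trans = nn.Linear(1024, num_classes * 3)  # t_k per class
        self.pivot = nn.Linear(1024, num_classes * 3)  # p_k per class
        self.moving = nn.Linear(1024, 2)               # still vs. moving logits

    def forward(self, roi_features):
        # roi_features: (N_RoI, in_features), the flattened per-RoI features
        x = F.dropout(F.relu(self.fc1(roi_features)), p=0.5, training=self.training)
        x = F.dropout(F.relu(self.fc2(x)), p=0.5, training=self.training)
        n = x.shape[0]
        return (self.rot(x).view(n, self.num_classes, 3),
                self.trans(x).view(n, self.num_classes, 3),
                self.pivot(x).view(n, self.num_classes, 3),
                F.softmax(self.moving(x), dim=1))
\end{verbatim}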
\subsection{Supervision}
\label{ssec:supervision}

\paragraph{Per-RoI instance motion supervision with 3D instance motion ground truth}
The most straightforward way to supervise the object motions is by using ground truth
motions computed from ground truth object poses, which is in general
only practical when training on synthetic datasets.
Given the $k$-th foreground RoI (as defined for Mask R-CNN) with ground truth class $c_k^*$,
let $R_k, t_k, p_k, o_k$ be the predicted motion for class $c_k^*$ as parametrized above,
and $R_k^*, t_k^*, p_k^*, o_k^*$ the ground truth motion of the matched ground truth instance.
Similar to the camera pose regression loss in \cite{PoseNet2},
we use a variant of the $\ell_1$-loss to penalize the differences between ground truth and predicted
rotation, translation (and pivot, in our case). We found that the smooth $\ell_1$-loss
performs better in our case than the standard $\ell_1$-loss.
We thus compute the RoI motion loss as
\begin{equation}
L_{motion} = \frac{1}{\text{N}_{RoI}^{\mathit{fg}}} \sum_k^{\text{N}_{RoI}} \left[ (l_{R}^k + l_{t}^k) \cdot o_k^* + l_{p}^k + l_o^k \right],
\end{equation}
where
\begin{equation}
l_{R}^k = \ell_{reg} (R_k^* - R_k),
\end{equation}
\begin{equation}
l_{t}^k = \ell_{reg} (t_k^* - t_k),
\end{equation}
and
\begin{equation}
l_{p}^k = \ell_{reg} (p_k^* - p_k)
\end{equation}
are the smooth $\ell_1$-losses for the predicted rotation, translation and pivot,
respectively, and
\begin{equation}
l_o^k = \ell_{cls}(o_k, o_k^*)
\end{equation}
is the (categorical) cross-entropy loss for the predicted classification into moving and non-moving objects.
Note that we do not penalize the rotation and translation of objects with
$o_k^* = 0$, which do not move between $t$ and $t+1$. We found that the network
does not reliably predict exact identity motions for still objects; regressing
exact identities is numerically more difficult than classifying objects as
moving or non-moving and discarding the regression loss for the non-moving
ones. Also, analogously to masks and bounding boxes, the estimates for classes
other than $c_k^*$ are not penalized.

Our modified RoI loss is thus
\begin{equation}
L_{RoI} = L_{cls} + L_{box} + L_{mask} + L_{motion}.
\end{equation}
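
For illustration, the motion loss above can be computed as in the following sketch, assuming that only foreground RoIs are passed in and that the rotation difference is taken on the predicted sine values (helper names chosen for illustration only):
\begin{verbatim}
import numpy as np

def smooth_l1(x, delta=1.0):
    """Smooth L1 (Huber) penalty, summed over the last axis."""
    x = np.abs(x)
    return np.sum(np.where(x < delta, 0.5 * x ** 2 / delta, x - 0.5 * delta), axis=-1)

def motion_loss(sin_pred, t_pred, p_pred, o_logits, sin_gt, t_gt, p_gt, o_gt):
    """L_motion over foreground RoIs.

    sin_*, t_*, p_*: (N, 3); o_logits: (N, 2); o_gt: (N,) ints, 1 = moving, 0 = still.
    """
    l_R = smooth_l1(sin_gt - sin_pred)
    l_t = smooth_l1(t_gt - t_pred)
    l_p = smooth_l1(p_gt - p_pred)
    # softmax cross-entropy for the moving/still classification
    z = o_logits - o_logits.max(axis=1, keepdims=True)
    log_probs = z - np.log(np.sum(np.exp(z), axis=1, keepdims=True))
    l_o = -log_probs[np.arange(len(o_gt)), o_gt]
    # rotation and translation terms are gated by the ground truth moving flag
    return np.sum((l_R + l_t) * o_gt + l_p + l_o) / max(len(o_gt), 1)
\end{verbatim}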
\paragraph{Camera motion supervision}
We supervise the camera motion with ground truth analogously to the
object motions, the only difference being that there is
a rotation and a translation loss, but no pivot loss for the camera motion.
If the ground truth shows that the camera is not moving, we again do not
penalize rotation and translation. In this case, the camera motion loss reduces to the
classification loss.
\paragraph{Per-RoI instance motion supervision \emph{without} 3D instance motion ground truth}
\begin{figure}[t]
\centering
\includegraphics[width=\textwidth]{figures/flow_loss}
\caption{
Overview of the alternative, flow-based loss for instance motion
supervision without 3D instance motion ground truth.
In contrast to SfM-Net \cite{SfmNet}, where a single optical flow field is
composed and penalized to supervise the motion prediction, our loss considers
the motion of each object in isolation and composes a batch of flow windows
for all RoIs. Network predictions are shown in red.
}
\label{figure:flow_loss}
\end{figure}
A more general way to supervise the object motions is a re-projection
loss similar to the unsupervised loss in SfM-Net \cite{SfmNet},
which we can apply to coordinates within the object bounding boxes
and which does not require ground truth 3D object motions.

In this case, for any RoI,
we generate a uniform $m \times m$ grid of 2D points inside the RPN proposal bounding box
with the same resolution as the predicted mask.
Note that the predicted mask we use here is binarized at a threshold of $0.5$.
We use the same bounding box
to crop the corresponding region from the dense, full-image depth map
and bilinearly resize the depth crop to the same resolution as the mask and point
grid.
Next, we create a grid of 3D points (point cloud) from the grid of 2D points and the depth crop. To this point cloud, we
apply the object motion predicted for the RoI, masked by the predicted mask.
Then, we apply the camera motion to the 3D points, project them back to 2D,
and finally compute the optical flow at each point as the difference between the initial and the re-projected 2D grids.
Note that we batch this computation over all RoIs, so that we only perform
it once per forward pass.
Figure \ref{figure:flow_loss} illustrates the approach.
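
The following sketch illustrates this composition for a single RoI; nearest-neighbor depth lookup stands in for the bilinear resize described above, the batching over RoIs is omitted, and the function name is chosen for illustration only:
\begin{verbatim}
import numpy as np

def roi_flow_grid(box, m, depth, mask, R_k, t_k, p_k, R_cam, t_cam, intrinsics):
    """Compose the m x m flow grid for one RoI.

    box: (x1, y1, x2, y2) in pixels; depth: (H, W); mask: (m, m) binary;
    intrinsics: (c0, c1, f).
    """
    c0, c1, f = intrinsics
    x1, y1, x2, y2 = box
    x_t, y_t = np.meshgrid(np.linspace(x1, x2, m), np.linspace(y1, y2, m))
    d = depth[np.clip(y_t.round().astype(int), 0, depth.shape[0] - 1),
              np.clip(x_t.round().astype(int), 0, depth.shape[1] - 1)]
    # back-project the 2D point grid to a 3D point cloud at time t
    P = np.stack([(x_t - c0) * d / f, (y_t - c1) * d / f, d], axis=-1)
    # apply the predicted object motion only where the mask is on
    P_obj = (P - p_k) @ R_k.T + p_k + t_k
    P = np.where(mask[..., None] > 0.5, P_obj, P)
    # apply the camera motion and re-project to 2D
    P = P @ R_cam.T + t_cam
    x_t1 = f * P[..., 0] / P[..., 2] + c0
    y_t1 = f * P[..., 1] / P[..., 2] + c1
    return np.stack([x_t1 - x_t, y_t1 - y_t], axis=-1)   # (m, m, 2) optical flow
\end{verbatim}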
The mathematical details of the 3D transformations and the mappings between 2D and 3D are analogous to the
dense, full-image flow composition in Subsection \ref{ssec:postprocessing}, so we do not
duplicate them here. The only differences are that there is no sum over objects during
the point transformation based on instance motion, as we consider the single object
corresponding to an RoI in isolation, and that the masks are not resized to the
full image resolution, as
the depth crop and the grid of 2D points are at the same resolution as the predicted
$m \times m$ mask.
For each RoI, we can now compute the motion loss $L_{motion}$ and thus supervise the object motion
by penalizing the $m \times m$ optical flow grid.
If optical flow ground truth is available, we can use the RoI bounding box to
crop and resize a region from the ground truth optical flow to match the RoI's
optical flow grid, and penalize the difference between the flow grids with a (smooth) $\ell_1$-loss.
However, we could also use the re-projection loss without optical flow ground truth
to train the motion prediction in an unsupervised manner, similar to \cite{SfmNet}.
In this case, we could use the bounding box to crop and bilinearly resize the corresponding region
from the first image $I_t$, and bilinearly sample the corresponding region from the second image $I_{t+1}$
using the 2D point grid displaced by the predicted flow grid (which is often called \emph{backward warping}).
Then, we could penalize the difference
between the resulting image crops, for example with a census loss \cite{CensusTerm,UnFlow}.
For more details on differentiable bilinear sampling for deep learning, we refer the reader to
\cite{STN}.
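
For illustration, backward warping with bilinear sampling can be written as follows (single-channel image for brevity; function name chosen for illustration only):
\begin{verbatim}
import numpy as np

def backward_warp(image, x, y):
    """Sample `image` (H, W) at continuous coordinates (x, y) bilinearly.

    x, y: sampling locations, e.g. the 2D point grid displaced by the predicted
    flow. Out-of-bounds coordinates are clamped to the image border.
    """
    H, W = image.shape
    x = np.clip(x, 0, W - 1)
    y = np.clip(y, 0, H - 1)
    x0, y0 = np.floor(x).astype(int), np.floor(y).astype(int)
    x1, y1 = np.minimum(x0 + 1, W - 1), np.minimum(y0 + 1, H - 1)
    wx, wy = x - x0, y - y0
    top = (1 - wx) * image[y0, x0] + wx * image[y0, x1]
    bottom = (1 - wx) * image[y1, x0] + wx * image[y1, x1]
    return (1 - wy) * top + wy * bottom
\end{verbatim}
A photometric penalty would then compare the warped samples from $I_{t+1}$ against the corresponding crop from $I_t$.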
Compared to supervision with 3D instance motion ground truth, a re-projection
loss could benefit motion regression by removing the need to balance the
rotation, translation and pivot losses against each other \cite{PoseNet2},
which could make it interesting even when 3D motion ground truth is available.
\subsection{Training and inference}
\label{ssec:training_inference}

\paragraph{Training}
We train the Motion R-CNN RPN and RoI heads in exactly the same way as described for Mask R-CNN.
We additionally compute the camera and instance motion losses and concatenate the additional
frame (and, optionally, the XYZ coordinates) into the network input, but otherwise do not modify the training
procedure and sample proposals and RoIs exactly as before.

\paragraph{Inference}
During inference, we proceed analogously to Mask R-CNN.
As with the RoI mask head, at test time we compute the RoI motion head
from the features extracted with the refined bounding boxes.

Again, as for masks and bounding boxes in Mask R-CNN,
the output object motion is the motion predicted for the
highest-scoring class.
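
For illustration, assuming the per-class motion predictions are stacked into a single array, this selection amounts to:
\begin{verbatim}
import numpy as np

def select_output_motions(motions, class_scores):
    """Pick, for each RoI, the motion predicted for its highest-scoring class.

    motions: (N_RoI, num_classes, 9); class_scores: (N_RoI, num_classes).
    """
    best = np.argmax(class_scores, axis=1)                # (N_RoI,)
    return motions[np.arange(motions.shape[0]), best]     # (N_RoI, 9)
\end{verbatim}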
\subsection{Dense flow from 3D motion}
\label{ssec:postprocessing}

As a postprocessing step, we compose the dense optical flow between $I_t$ and $I_{t+1}$ from the outputs of our Motion R-CNN network.
Given the depth map $d_t$ for frame $I_t$, we first create a 3D point cloud in camera space at time $t$, where
\begin{equation}
P_t =
\begin{pmatrix}
X_t \\ Y_t \\ Z_t
\end{pmatrix}
=
\frac{d_t}{f}
\begin{pmatrix}
x_t - c_0 \\ y_t - c_1 \\ f
\end{pmatrix}
\end{equation}
is the 3D coordinate at $t$ corresponding to the point with pixel coordinates $x_t, y_t$,
which range over all pixel coordinates in $I_t$,
and $(c_0, c_1, f)$ are the camera intrinsics (principal point and focal length).
For now, the depth map is always assumed to come from ground truth.
Given N detections with predicted motions as above, we transform all points within the bounding
box and mask of each detected object according to the predicted motion of that object.

For this, we first define the \emph{full image} mask $M_k$ for object $k$,
which is computed from the predicted box mask $m_k$ (for the predicted class) by bilinearly resizing
it to the width and height of the predicted bounding box and then copying the values
of the resized mask into a full image resolution mask initialized with zeros,
starting at the top-left coordinate of the predicted bounding box.
Again, we binarize masks at a threshold of $0.5$.

Then, given the predicted motions $(R_k, t_k)$ and pivots $p_k$ for all objects,
\begin{equation}
P'_{t+1} =
P_t + \sum_{k=1}^{\text{N}} M_k\left\{ R_k \cdot (P_t - p_k) + p_k + t_k - P_t \right\},
\end{equation}
where N is the number of detections.
The motion predictions are understood to have already taken into account
the classification into moving and still objects,
and we thus have, as described above, identity motions for all objects with $o_k = 0$.
Next, we transform the points according to the camera transformation $\{R_{cam}, t_{cam}\} \in \mathbf{SE}(3)$,
\begin{equation}
\begin{pmatrix}
X_{t+1} \\ Y_{t+1} \\ Z_{t+1}
\end{pmatrix}
= P_{t+1} = R_{cam} \cdot P'_{t+1} + t_{cam}.
\end{equation}

%Note that in our experiments, we either use the ground truth camera motion to focus
%on evaluating the object motion predictions or the predicted camera motion to evaluate
%the complete motion estimates. We will always state which variant we use in the experimental section.

Finally, we project the transformed 3D points at time $t+1$ back to 2D pixel coordinates,
\begin{equation}
\begin{pmatrix}
x_{t+1} \\ y_{t+1}
\end{pmatrix}
=
\frac{f}{Z_{t+1}}
\begin{pmatrix}
X_{t+1} \\ Y_{t+1}
\end{pmatrix}
+
\begin{pmatrix}
c_0 \\ c_1
\end{pmatrix}.
\end{equation}
We now obtain the optical flow between $I_t$ and $I_{t+1}$ at each point as
\begin{equation}
\begin{pmatrix}
u \\ v
\end{pmatrix}
=
\begin{pmatrix}
x_{t+1} - x_{t} \\ y_{t+1} - y_{t}
\end{pmatrix}.
\end{equation}
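
The complete composition is summarized in the following (non-differentiable) reference sketch, assuming the full image masks $M_k$ and the per-object motions have already been computed as described above (function name chosen for illustration only):
\begin{verbatim}
import numpy as np

def compose_dense_flow(depth, masks, motions, cam_motion, intrinsics):
    """Compose dense optical flow from depth, object motions and camera motion.

    depth: (H, W); masks: list of (H, W) binary full-image masks M_k;
    motions: list of (R_k, t_k, p_k); cam_motion: (R_cam, t_cam);
    intrinsics: (c0, c1, f).
    """
    c0, c1, f = intrinsics
    H, W = depth.shape
    x_t, y_t = np.meshgrid(np.arange(W, dtype=np.float64),
                           np.arange(H, dtype=np.float64))
    # back-project all pixels to a 3D point cloud at time t
    P = np.stack([(x_t - c0) * depth / f, (y_t - c1) * depth / f, depth], axis=-1)
    # apply each object's rigid motion inside its full-image mask
    P_prime = P.copy()
    for M_k, (R_k, t_k, p_k) in zip(masks, motions):
        moved = (P - p_k) @ R_k.T + p_k + t_k
        P_prime += M_k[..., None] * (moved - P)
    # apply the camera motion
    R_cam, t_cam = cam_motion
    P1 = P_prime @ R_cam.T + t_cam
    # project back to 2D and take the difference to the original pixel grid
    x_t1 = f * P1[..., 0] / P1[..., 2] + c0
    y_t1 = f * P1[..., 1] / P1[..., 2] + c1
    return np.stack([x_t1 - x_t, y_t1 - y_t], axis=-1)   # (H, W, 2) optical flow
\end{verbatim}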