\subsection{Motion R-CNN}
\label{ssec:model}
Building on Mask R-CNN \cite{MaskRCNN},
we estimate per-object motion by predicting a 3D rigid motion for each detected object.
For this, we extend Mask R-CNN in two straightforward ways.
First, we modify the backbone network and provide it with two frames
in order to enable image matching between the consecutive frames.
Second, we extend the Mask R-CNN RoI head to predict a 3D motion and pivot for each
region proposal. Tables \ref{table:motionrcnn_resnet} and \ref{table:motionrcnn_resnet_fpn}
show our Motion R-CNN networks based on Mask R-CNN ResNet and Mask R-CNN ResNet-FPN,
respectively.
{\begin{table}[h]
\centering
\begin{tabular}{llr}
\toprule
\textbf{Output} & \textbf{Layer Operations} & \textbf{Output Dimensions} \\
\midrule\midrule
& input images $I_t$, $I_{t+1}$, and (optional) XYZ$_{t}$, XYZ$_{t+1}$ & H $\times$ W $\times$ C \\
\midrule
C$_4$ & ResNet \{up to C$_4$\} (Table \ref{table:resnet}) & $\tfrac{1}{16}$ H $\times$ $\tfrac{1}{16}$ W $\times$ 1024 \\
\midrule
\multicolumn{3}{c}{\textbf{Region Proposal Network (RPN)} (Table \ref{table:maskrcnn_resnet})}\\
\midrule
\multicolumn{3}{c}{\textbf{Camera Motion Network}}\\
\midrule
& From C$_4$: ResNet \{C$_5$\} (Table \ref{table:resnet}) & $\tfrac{1}{32}$ H $\times$ $\tfrac{1}{32}$ W $\times$ 2048 \\
& ResNet \{C$_6$\} (Table \ref{table:resnet}) & $\tfrac{1}{64}$ H $\times$ $\tfrac{1}{64}$ W $\times$ 2048 \\
& 1 $\times$ 1 conv, 512 & $\tfrac{1}{64}$ H $\times$ $\tfrac{1}{64}$ W $\times$ 512 \\
& bilinear resize, 7 $\times$ 7 & 7 $\times$ 7 $\times$ 512 \\
& flatten & 1 $\times$ 7 $\cdot$ 7 $\cdot$ 512 \\
T$_0$ & $\begin{bmatrix}\textrm{fully connected}, 1024\end{bmatrix}$ $\times$ 2 & 1 $\times$ 1024 \\
$R_{cam}$& From T$_0$: fully connected, 3 & 1 $\times$ 3 \\
$t_{cam}$& From T$_0$: fully connected, 3 & 1 $\times$ 3 \\
& From T$_0$: fully connected, 2 & 1 $\times$ 2 \\
$o_{cam}$& softmax, 2 & 1 $\times$ 2 \\
\midrule
\multicolumn{3}{c}{\textbf{RoI Head \& RoI Head: Masks} (Table \ref{table:maskrcnn_resnet})}\\
\midrule
\multicolumn{3}{c}{\textbf{RoI Head: Motions}}\\
\midrule
%& From M$_0$: flatten & N$_{RoI}$ $\times$ 7 $\cdot$ 7 $\cdot$ 256 \\
T$_1$ & From ave: $\begin{bmatrix}\textrm{fully connected}, 1024\end{bmatrix}$ $\times$ 2 & N$_{RoI}$ $\times$ 1024 \\
$\forall k: R_k$ & From T$_1$: fully connected, 3 & N$_{RoI}$ $\times$ 3 \\
$\forall k: t_k$ & From T$_1$: fully connected, 3 & N$_{RoI}$ $\times$ 3 \\
$\forall k: p_k$ & From T$_1$: fully connected, 3 & N$_{RoI}$ $\times$ 3 \\
& From T$_1$: fully connected, 2 & N$_{RoI}$ $\times$ 2 \\
$\forall k: o_k$ & softmax, 2 & N$_{RoI}$ $\times$ 2 \\
\bottomrule
\end{tabular}
\caption {
Motion R-CNN ResNet architecture based on the Mask R-CNN
ResNet architecture (Table \ref{table:maskrcnn_resnet}).
We use ReLU activations after all hidden layers and
additionally dropout with $p = 0.5$ after all fully-connected hidden layers.
}
\label{table:motionrcnn_resnet}
\end{table}
}
{
\begin{table}[h]
\centering
\begin{tabular}{llr}
\toprule
\textbf{Output} & \textbf{Layer Operations} & \textbf{Output Dimensions} \\
\midrule\midrule
& input images $I_t$, $I_{t+1}$, and (optional) XYZ$_{t}$, XYZ$_{t+1}$ & H $\times$ W $\times$ C \\
\midrule
C$_6$ & ResNet (Table \ref{table:resnet}) & $\tfrac{1}{64}$ H $\times$ $\tfrac{1}{64}$ W $\times$ 2048 \\
\midrule
\multicolumn{3}{c}{\textbf{RPN \& FPN} (Table \ref{table:maskrcnn_resnet_fpn})} \\
\midrule
\multicolumn{3}{c}{\textbf{Camera Motion Network}}\\
\midrule
& From C$_6$: 1 $\times$ 1 conv, 512 & $\tfrac{1}{64}$ H $\times$ $\tfrac{1}{64}$ W $\times$ 512 \\
& bilinear resize, 7 $\times$ 7 & 7 $\times$ 7 $\times$ 512 \\
& flatten & 1 $\times$ 7 $\cdot$ 7 $\cdot$ 512 \\
T$_2$ & $\begin{bmatrix}\textrm{fully connected}, 1024\end{bmatrix}$ $\times$ 2 & 1 $\times$ 1024 \\
$R_{cam}$& From T$_2$: fully connected, 3 & 1 $\times$ 3 \\
$t_{cam}$& From T$_2$: fully connected, 3 & 1 $\times$ 3 \\
& From T$_2$: fully connected, 2 & 1 $\times$ 2 \\
$o_{cam}$& softmax, 2 & 1 $\times$ 2 \\
\midrule
\multicolumn{3}{c}{\textbf{RoI Head \& RoI Head: Masks} (Table \ref{table:maskrcnn_resnet_fpn})} \\
\midrule
\multicolumn{3}{c}{\textbf{RoI Head: Motions}}\\
\midrule
%& From M$_1$: flatten & N$_{RoI}$ $\times$ 14 $\cdot$ 14 $\cdot$ 256 \\
T$_3$ & From F$_1$: $\begin{bmatrix}\textrm{fully connected}, 1024\end{bmatrix}$ $\times$ 2 & N$_{RoI}$ $\times$ 1024 \\
$\forall k: R_k$ & From T$_3$: fully connected, 3 & N$_{RoI}$ $\times$ 3 \\
$\forall k: t_k$ & From T$_3$: fully connected, 3 & N$_{RoI}$ $\times$ 3 \\
$\forall k: p_k$ & From T$_3$: fully connected, 3 & N$_{RoI}$ $\times$ 3 \\
& From T$_3$: fully connected, 2 & N$_{RoI}$ $\times$ 2 \\
$\forall k: o_k$ & softmax, 2 & N$_{RoI}$ $\times$ 2 \\
\bottomrule
\end{tabular}
\caption {
Motion R-CNN ResNet-FPN architecture based on the Mask R-CNN
ResNet-FPN architecture (Table \ref{table:maskrcnn_resnet_fpn}).
To obtain a larger bottleneck stride, we compute the feature pyramid starting
with C$_6$ instead of C$_5$, and thus, the subsampling from P$_5$ to P$_6$ is omitted.
The modifications are analogous to our Motion R-CNN ResNet,
but we still show the architecture for completeness.
Again, we use ReLU activations after all hidden layers and
additionally dropout with $p = 0.5$ after all fully-connected hidden layers.
}
\label{table:motionrcnn_resnet_fpn}
\end{table}
}
\paragraph{Motion R-CNN backbone}
Like Faster R-CNN and Mask R-CNN, we use a ResNet variant \cite{ResNet} as backbone network to compute feature maps from input imagery.
Inspired by FlowNetS \cite{FlowNet}, we make one modification to the ResNet backbone to enable image matching,
laying the foundation for our motion estimation. Instead of taking a single image as input to the backbone,
we depth-concatenate two temporally consecutive frames $I_t$ and $I_{t+1}$, yielding an input map with six channels.
Additionally, we experiment with concatenating the camera-space XYZ coordinates of each frame,
XYZ$_t$ and XYZ$_{t+1}$, into the input as well.
We do not introduce a separate network for computing region proposals; the modified backbone
provides the features for both the RPN and the RoI extraction.
Technically, our feature encoder network will have to learn image matching representations similar to
those learned by the FlowNet encoder, but the output will be computed in the
object-centric framework of a region-based convolutional network head with a 3D parametrization.
Thus, in contrast to the dense FlowNet decoder, the estimated dense image matching information
from the encoder is integrated for specific objects via RoI extraction and subsequently
processed by the RoI head for each object.
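To make the input construction concrete, the following minimal sketch shows the depth-concatenation step.
The PyTorch-style interface and all names are illustrative assumptions for the sketch and do not describe a particular implementation.
\begin{verbatim}
import torch

def build_backbone_input(img_t, img_tp1, xyz_t=None, xyz_tp1=None):
    """Depth-concatenate two consecutive frames (and optionally their
    camera-space XYZ coordinate maps) into a single backbone input.
    All tensors are N x C x H x W; images have 3 channels each, so the
    result has 6 channels, or 12 when both XYZ maps are given."""
    inputs = [img_t, img_tp1]
    if xyz_t is not None and xyz_tp1 is not None:
        inputs += [xyz_t, xyz_tp1]
    return torch.cat(inputs, dim=1)
\end{verbatim}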
\paragraph{Per-RoI motion prediction}
We use a 3D rigid motion parametrization similar to the one used in SE3-Nets and SfM-Net \cite{SE3Nets, SfmNet}.
For the $k$-th object proposal, we predict the rigid transformation $\{R_k, t_k\}\in \mathbf{SE}(3)$
\footnote{$\mathbf{SE}(3)$ refers to the Special Euclidean Group representing 3D rotations
and translations: $\{(R, t) \mid R \in \mathbf{SO}(3),\ t \in \mathbb{R}^3\}$}
of the object between the two frames $I_t$ and $I_{t+1}$, as well as the object pivot $p_k \in \mathbb{R}^3$ at time $t$.
We parametrize ${R_k}$ using an Euler angle representation,
\begin{equation}
R_k = R_k^z(\gamma) \cdot R_k^x(\alpha) \cdot R_k^y(\beta),
\end{equation}
where
\begin{equation}
R_k^x(\alpha) =
\begin{pmatrix}
1 & 0 & 0 \\
0 & \cos(\alpha) & -\sin(\alpha) \\
0 & \sin(\alpha) & \cos(\alpha)
\end{pmatrix},
\end{equation}
\begin{equation}
R_k^y(\beta) =
\begin{pmatrix}
\cos(\beta) & 0 & \sin(\beta) \\
0 & 1 & 0 \\
-\sin(\beta) & 0 & \cos(\beta)
\end{pmatrix},
\end{equation}
\begin{equation}
R_k^z(\gamma) =
\begin{pmatrix}
\cos(\gamma) & -\sin(\gamma) & 0 \\
\sin(\gamma) & \cos(\gamma) & 0 \\
0 & 0 & 1
\end{pmatrix},
\end{equation}
and $\alpha, \beta, \gamma$ are the rotation angles in radians about the $x,y,z$-axis, respectively.
We then extend the Mask R-CNN head by adding a small fully-connected network for motion
prediction in addition to the fully-connected layers for
refined boxes and classes and the convolutional network for the masks.
Like for refined boxes and masks, we make one separate motion prediction for each class.
Each instance motion is predicted as a set of nine values,
$\sin(\alpha)$, $\sin(\beta)$, $\sin(\gamma)$, $t_k$ and $p_k$,
where $\sin(\alpha)$, $\sin(\beta)$ and $\sin(\gamma)$ are clipped to $[-1, 1]$.
Here, we assume that motions between frames are relatively small
and that objects rotate at most 90 degrees in either direction about any axis,
which is generally a safe assumption for image sequences from videos
and enables us to recover unique cosine values from the predicted sines.
All predictions are made in camera space, and translation and pivot predictions are in meters.
We additionally predict softmax scores $o_k$ for classifying the objects into
still and moving objects. As a postprocessing, for any object instance $k$ with predicted moving flag $o_k = 0$,
we set $\sin(\alpha) = \sin(\beta) = \sin(\gamma) = 0$ and $t_k = (0,0,0)^T$,
and thus predict an identity motion.
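For illustration, the following NumPy sketch recovers the full rotation matrix from the three predicted (clipped) sines under the above assumption, using $\cos = \sqrt{1 - \sin^2}$; the function name is purely illustrative.
\begin{verbatim}
import numpy as np

def rotation_from_sines(sin_a, sin_b, sin_g):
    """Recover R = R_z(gamma) . R_x(alpha) . R_y(beta) from the predicted
    sines, assuming |angle| <= 90 degrees so that cos = sqrt(1 - sin^2) >= 0."""
    sin_a, sin_b, sin_g = np.clip([sin_a, sin_b, sin_g], -1.0, 1.0)
    cos_a, cos_b, cos_g = np.sqrt(1.0 - np.square([sin_a, sin_b, sin_g]))
    R_x = np.array([[1, 0, 0],
                    [0, cos_a, -sin_a],
                    [0, sin_a, cos_a]])
    R_y = np.array([[cos_b, 0, sin_b],
                    [0, 1, 0],
                    [-sin_b, 0, cos_b]])
    R_z = np.array([[cos_g, -sin_g, 0],
                    [sin_g, cos_g, 0],
                    [0, 0, 1]])
    return R_z @ R_x @ R_y
\end{verbatim}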
\paragraph{Camera motion prediction}
In addition to the object transformations, we optionally predict the camera motion $\{R_{cam}, t_{cam}\}\in \mathbf{SE}(3)$
between the two frames $I_t$ and $I_{t+1}$.
For this, we branch off a small fully-connected network from the bottleneck output of the backbone.
We again represent $R_{cam}$ using an Euler angle representation and
predict $\sin(\alpha)$, $\sin(\beta)$, $\sin(\gamma)$ and $t_{cam}$ in the same way as for the individual objects.
Again, we predict a softmax score $o_{cam}$ for differentiating between
a still and moving camera.
\subsection{Network design}
\label{ssec:design}
\paragraph{Camera motion network}
In our ResNet variant without FPN (Table \ref{table:motionrcnn_resnet}), the underlying
ResNet backbone is only computed up to the $C_4$ block, as otherwise the
feature resolution prior to RoI extraction would be reduced too much.
Therefore, in our variant without FPN, we first pass the $C_4$ features through $C_5$
and $C_6$ blocks (with weights independent from the $C_5$ block used in the RoI head in this variant)
to increase the bottleneck stride prior to the camera motion network to 64.
In our ResNet-FPN variant (Table \ref{table:motionrcnn_resnet_fpn}),
the backbone makes use of all blocks through $C_6$, and
we can simply branch off our camera motion network from the $C_6$ bottleneck.
Then, in both the ResNet and the ResNet-FPN variant, we apply one additional
convolution to the $C_6$ features to reduce the number of inputs to the following
fully-connected layers, and thus keep the number of weights reasonably small.
Instead of averaging, we use bilinear resizing to bring the convolutional features
to a fixed size without losing all spatial information,
flatten them, and finally apply multiple fully-connected layers to predict the
camera motion parameters.
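As an illustration of this branch, the following sketch mirrors the layer listing in Tables \ref{table:motionrcnn_resnet} and \ref{table:motionrcnn_resnet_fpn}, operating on the C$_6$ bottleneck features.
The choice of PyTorch and all names are assumptions made only for the sketch, not a description of a particular implementation.
\begin{verbatim}
import torch
import torch.nn as nn
import torch.nn.functional as F

class CameraMotionHead(nn.Module):
    """Camera motion branch: 1x1 conv -> bilinear resize to 7x7 -> flatten
    -> two FC-1024 layers (ReLU, dropout 0.5) -> separate outputs for the
    rotation sines, the translation and the moving/still classification."""
    def __init__(self, in_channels=2048):
        super().__init__()
        self.reduce = nn.Conv2d(in_channels, 512, kernel_size=1)
        self.fc = nn.Sequential(
            nn.Linear(7 * 7 * 512, 1024), nn.ReLU(), nn.Dropout(0.5),
            nn.Linear(1024, 1024), nn.ReLU(), nn.Dropout(0.5))
        self.rot = nn.Linear(1024, 3)     # sin(alpha), sin(beta), sin(gamma)
        self.trans = nn.Linear(1024, 3)   # t_cam in meters
        self.moving = nn.Linear(1024, 2)  # logits for still vs. moving camera

    def forward(self, c6):
        x = F.relu(self.reduce(c6))
        x = F.interpolate(x, size=(7, 7), mode='bilinear',
                          align_corners=False)
        x = self.fc(torch.flatten(x, start_dim=1))
        # Sines are clipped to [-1, 1] as described above.
        return torch.clamp(self.rot(x), -1.0, 1.0), self.trans(x), self.moving(x)
\end{verbatim}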
\paragraph{RoI motion head network}
In both of our network variants
(Tables \ref{table:motionrcnn_resnet} and \ref{table:motionrcnn_resnet_fpn}),
we compute the fully-connected network for motion prediction from the
flattened RoI features, which are also the basis for classification and
bounding box refinement.
Note that the features passed to our ResNet-FPN RoI head (extracted from the upsampled FPN
level appropriate to the RoI bounding box scale) have passed through the $C_6$
bottleneck, which has a stride of 64 with respect to the original image.
In contrast, the bottleneck for the features passed to our ResNet RoI head
is $C_4$ (with a stride of 16). Thus, the ResNet-FPN variant can in principle estimate
object motions based on larger displacements than the ResNet variant.
Additionally, as smaller bounding boxes use higher resolution features, the
motions and pivots of (especially smaller) objects can in principle be more accurately
estimated with the FPN variant.
\subsection{Supervision}
\label{ssec:supervision}
\paragraph{Per-RoI instance motion supervision with 3D instance motion ground truth}
The most straightforward way to supervise the object motions is by using ground truth
motions computed from ground truth object poses, which is in general
only practical when training on synthetic datasets.
Given the $k$-th foreground RoI (as defined for Mask R-CNN) with ground truth class $c_k^*$,
let $R_k, t_k, p_k, o_k$ be the predicted motion for class $c_k^*$ as parametrized above,
and $R_k^*, t_k^*, p_k^*, o_k^*$ the ground truth motion for the matched ground truth example.
Similar to the camera pose regression loss in \cite{PoseNet2},
we use a variant of the $\ell_1$-loss to penalize the differences between ground truth and predicted
rotation, translation (and pivot, in our case). We found that the smooth $\ell_1$-loss
performs better in our case than the standard $\ell_1$-loss.
We thus compute the RoI motion loss as
\begin{equation}
L_{motion} = \frac{1}{\text{N}_{RoI}^{\mathit{fg}}} \sum_{k}^{\text{N}_{RoI}} \left[ (l_{R}^k + l_{t}^k) \cdot o_k^* + l_{p}^k + l_o^k \right],
\end{equation}
where
\begin{equation}
l_{R}^k = \ell_{reg} (R_k^* - R_k),
\end{equation}
\begin{equation}
l_{t}^k = \ell_{reg} (t_k^* - t_k),
\end{equation}
and
\begin{equation}
l_{p}^k = \ell_{reg} (p_k^* - p_k)
\end{equation}
are the smooth-$\ell_1$ losses for the predicted rotation, translation and pivot,
respectively, and
\begin{equation}
l_o^k = \ell_{cls}(o_k, o_k^*)
\end{equation}
is the (categorical) cross-entropy loss for the predicted classification into moving and non-moving objects.
Note that we do not penalize the rotation and translation for objects with
$o_k^* = 0$, which do not move between $t$ and $t+1$. We found that the network
cannot reliably regress exact identity motions for still objects; classifying
objects into moving and non-moving and discarding the rotation and translation
regression for the non-moving ones is numerically easier to optimize.
Also, analogously to masks and bounding boxes, the estimates for classes
other than $c_k^*$ are not penalized.
Now, our modified RoI loss is
\begin{equation}
L_{RoI} = L_{cls} + L_{box} + L_{mask} + L_{motion}.
\end{equation}
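A minimal sketch of the RoI motion loss $L_{motion}$ above follows, assuming PyTorch and assuming that predictions and ground truth have already been gathered for the foreground RoIs and their matched classes; all names are illustrative.
\begin{verbatim}
import torch
import torch.nn.functional as F

def roi_motion_loss(pred, gt, num_fg):
    """Smooth-L1 on rotation and translation (gated by the ground truth
    moving flag o*), smooth-L1 on the pivot, and cross-entropy on the
    moving/still classification, normalized by the number of foreground RoIs.
    pred/gt: dicts with 'rot', 'trans', 'pivot' of shape (N, 3);
    pred['moving'] holds (N, 2) logits, gt['moving'] integer labels in {0, 1}."""
    def l1(a, b):
        return F.smooth_l1_loss(a, b, reduction='none').sum(dim=1)

    o_star = gt['moving'].float()  # 1 for moving, 0 for still objects
    l_rt = (l1(pred['rot'], gt['rot'])
            + l1(pred['trans'], gt['trans'])) * o_star
    l_p = l1(pred['pivot'], gt['pivot'])
    l_o = F.cross_entropy(pred['moving'], gt['moving'], reduction='none')
    return (l_rt + l_p + l_o).sum() / num_fg
\end{verbatim}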
\paragraph{Camera motion supervision}
We supervise the camera motion with ground truth analogously to the
object motions, with the only difference being that we only have
a rotation and translation, but no pivot loss for the camera motion.
If the ground truth shows that the camera is not moving, we again do not
penalize rotation and translation. In this case, the camera motion loss is reduced to the
classification loss.
\paragraph{Per-RoI instance motion supervision \emph{without} 3D instance motion ground truth}
\begin{figure}[t]
\centering
\includegraphics[width=\textwidth]{figures/flow_loss}
\caption{
Overview of the alternative, flow-based loss for instance motion
supervision without 3D instance motion ground truth.
In contrast to SfM-Net \cite{SfmNet}, where a single optical flow field is
composed and penalized to supervise the motion prediction, our loss considers
the motion of all objects in isolation and composes a batch of flow windows
for all RoIs. Network predictions are shown in red.
}
\label{figure:flow_loss}
\end{figure}
A more general way to supervise the object motions is a re-projection
loss similar to the unsupervised loss in SfM-Net \cite{SfmNet},
which we can apply to coordinates within the object bounding boxes,
and which does not require ground truth 3D object motions.
In this case, for any RoI,
we generate a uniform $m \times m$ grid of 2D points inside the RPN proposal bounding box
with the same resolution as the predicted mask.
Note that the predicted mask used here is binarized at a threshold of $0.5$.
We use the same bounding box
to crop the corresponding region from the dense, full-image depth map
and bilinearly resize the depth crop to the same resolution as the mask and point
grid.
Next, we create a grid of 3D points (point cloud) from the grid of 2D points and depth crop. To this point cloud, we
apply the object motion predicted for the RoI, masked by the predicted mask.
Then, we apply the camera motion to the 3D points, project them back to 2D
and finally compute the optical flow at each point as the difference of the initial and re-projected 2D grids.
Note that we batch this computation over all RoIs, so that we only perform
it once per forward pass.
Figure \ref{figure:flow_loss} illustrates the approach.
The mathematical details for the 3D transformations and mappings between 2D and 3D are analogous to the
dense, full-image flow composition in the following subsection, so we will not
duplicate them here. The only differences are that there is no sum over objects during
the point transformation based on instance motion, as we consider the single object
corresponding to an RoI in isolation, and that the masks are not resized to the
full image resolution, as
the depth crop and the grid of 2D points are at the same resolution as the predicted
$m \times m$ mask.
For each RoI, we can now compute $L_{RoI}$ and thus supervise the object motion
by penalizing the $m \times m$ optical flow grid.
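A compact sketch of this per-RoI flow composition is given below (PyTorch, purely illustrative); the depth crop and the mask probabilities are assumed to be already extracted at the $m \times m$ grid resolution.
\begin{verbatim}
import torch

def roi_flow_grid(box, depth_crop, mask_prob, R_k, t_k, p_k,
                  R_cam, t_cam, intrinsics):
    """Build an m x m grid of 2D points inside the RPN box, back-project
    them to 3D using the cropped and resized depth, apply the predicted
    object motion inside the binarized mask, apply the camera motion,
    re-project, and return the m x m x 2 flow grid.
    box = (x0, y0, x1, y1), intrinsics = (c0, c1, f);
    depth_crop and mask_prob are m x m tensors."""
    c0, c1, f = intrinsics
    x0, y0, x1, y1 = box
    m = mask_prob.shape[-1]
    y, x = torch.meshgrid(torch.linspace(y0, y1, m),
                          torch.linspace(x0, x1, m), indexing='ij')
    mask = (mask_prob > 0.5).float()
    # Back-project the 2D grid to camera space at time t.
    P = torch.stack([(x - c0) * depth_crop / f,
                     (y - c1) * depth_crop / f,
                     depth_crop], dim=-1)                  # m x m x 3
    # Rigid object motion about the pivot, applied only inside the mask.
    P_obj = (P - p_k) @ R_k.T + p_k + t_k
    P = P + mask[..., None] * (P_obj - P)
    # Camera motion, then projection back to the image plane.
    P = P @ R_cam.T + t_cam
    x2 = f * P[..., 0] / P[..., 2] + c0
    y2 = f * P[..., 1] / P[..., 2] + c1
    return torch.stack([x2 - x, y2 - y], dim=-1)           # m x m x 2
\end{verbatim}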
If there is optical flow ground truth available, we can use the RoI bounding box to
crop and resize a region from the ground truth optical flow to match the RoI's
optical flow grid and penalize the difference between the flow grids with a (smooth) $\ell_1$-loss.
However, we could also use the re-projection loss without optical flow ground truth
to train the motion prediction in an unsupervised manner, similar to \cite{SfmNet}.
In this case, we could use the bounding box to crop and bilinearly resize the corresponding region
from the first image $I_t$ and bilinearly sample the corresponding region from the second image $I_{t+1}$,
using the 2D point grid displaced with the predicted flow grid (which is often called \emph{backward warping}).
Then, we could penalize the difference
between the resulting image crops, for example, with a census loss \cite{CensusTerm,UnFlow}.
For more details on differentiable bilinear sampling for deep learning, we refer the reader to
\cite{STN}.
When compared to supervision with 3D instance motion ground truth, a re-projection
loss could benefit motion regression by removing any loss balancing issues between the
rotation, translation and pivot losses \cite{PoseNet2},
which could make it interesting even when 3D motion ground truth is available.
\subsection{Training and inference}
\label{ssec:training_inference}
\paragraph{Training}
We train the Motion R-CNN RPN and RoI heads in the exact same way as described for Mask R-CNN.
We additionally compute the camera and instance motion losses and concatenate the additional
frame (and, optionally, XYZ coordinates) into the network input, but otherwise do not modify the training procedure
and sample proposals and RoIs in the exact same way.
\paragraph{Inference}
During inference, we proceed analogously to Mask R-CNN.
As with the RoI mask head, at test time we compute the RoI motion head
from the features extracted with the refined bounding boxes.
Again, as for masks and bounding boxes in Mask R-CNN,
the final output motion for each object is the motion predicted for the
highest scoring class.
\subsection{Dense flow from 3D motion}
\label{ssec:postprocessing}
As a postprocessing, we compose the dense optical flow between $I_t$ and $I_{t+1}$ from the outputs of our Motion R-CNN network.
Given the depth map $d_t$ for frame $I_t$, we first create a 3D point cloud in camera space at time $t$,
where
\begin{equation}
P_t =
\begin{pmatrix}
X_t \\ Y_t \\ Z_t
\end{pmatrix}
=
\frac{d_t}{f}
\begin{pmatrix}
x_t - c_0 \\ y_t - c_1 \\ f
\end{pmatrix},
\end{equation}
is the 3D coordinate at $t$ corresponding to the point with pixel coordinates $x_t, y_t$,
which range over all coordinates in $I_t$,
and $(c_0, c_1, f)$ are the camera intrinsics.
For now, the depth map is always assumed to come from ground truth.
Given N detections with predicted motions as above, we transform all points within the bounding
box and mask of each detected object according to its predicted motion.
For this, we first define the \emph{full image} mask $M_k$ for object $k$,
which is computed from the predicted box mask $m_k$ (for the predicted class) by bilinearly resizing
it to the width and height of the predicted bounding box and then copying the values
of the resized mask into a full (image) resolution mask initialized with zeros,
starting at the top-left coordinate of the predicted bounding box.
Again, we binarize masks at a threshold of $0.5$.
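As an illustration of this pasting step, a minimal sketch follows; OpenCV is assumed only for the bilinear resize, and names and boundary handling are illustrative.
\begin{verbatim}
import numpy as np
import cv2

def full_image_mask(m_k, box, image_size):
    """Paste the m x m box mask m_k into a full-resolution mask M_k:
    bilinearly resize it to the (rounded) box size, binarize at 0.5 and
    copy it into a zero-initialized H x W mask at the box's top-left corner.
    box = (x0, y0, x1, y1) in pixel coordinates, image_size = (H, W)."""
    H, W = image_size
    x0, y0, x1, y1 = (int(round(v)) for v in box)
    w, h = max(x1 - x0, 1), max(y1 - y0, 1)
    resized = cv2.resize(m_k.astype(np.float32), (w, h),
                         interpolation=cv2.INTER_LINEAR)
    M_k = np.zeros((H, W), dtype=np.float32)
    M_k[y0:y0 + h, x0:x0 + w] = (resized > 0.5).astype(np.float32)
    return M_k
\end{verbatim}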
Then, given the predicted motions $(R_k, t_k)$ and pivots $p_k$ for all objects,
\begin{equation}
P'_{t+1} =
P_t + \sum_{k=1}^{\text{N}} M_k\left\{ R_k \cdot (P_t - p_k) + p_k + t_k - P_t \right\},
\end{equation}
where N is the number of detections.
The motion predictions are understood to have already taken into account
the classification into moving and still objects,
and we thus have, as described above, identity motions for all objects with $o_k = 0$.
Next, we transform points given the camera transformation $\{R_{cam}, t_{cam}\} \in \mathbf{SE}(3)$,
\begin{equation}
\begin{pmatrix}
X_{t+1} \\ Y_{t+1} \\ Z_{t+1}
\end{pmatrix}
= P_{t+1} = R_{cam} \cdot P'_{t+1} + t_{cam}.
\end{equation}
%Note that in our experiments, we either use the ground truth camera motion to focus
%on evaluating the object motion predictions or the predicted camera motion to evaluate
%the complete motion estimates. We will always state which variant we use in the experimental section.
Finally, we project the transformed 3D points at time $t+1$ to 2D pixel coordinates again,
\begin{equation}
\begin{pmatrix}
x_{t+1} \\ y_{t+1}
\end{pmatrix}
=
\frac{f}{Z_{t+1}}
\begin{pmatrix}
X_{t+1} \\ Y_{t+1}
\end{pmatrix}
+
\begin{pmatrix}
c_0 \\ c_1
\end{pmatrix}.
\end{equation}
We now obtain the optical flow between $I_t$ and $I_{t+1}$ at each point as
\begin{equation}
\begin{pmatrix}
u \\ v
\end{pmatrix}
=
\begin{pmatrix}
x_{t+1} - x_{t} \\ y_{t+1} - y_{t}
\end{pmatrix}.
\end{equation}
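The complete composition can be summarized in a short sketch (NumPy, illustrative): it takes the binarized full-image masks, the predicted motions, the camera motion and the camera intrinsics, and returns the dense flow field.
\begin{verbatim}
import numpy as np

def compose_dense_flow(depth, masks, motions, cam_motion, intrinsics):
    """Compose dense optical flow from predicted 3D motions.
    depth:      H x W depth map d_t (from ground truth here)
    masks:      list of N binarized full-image masks M_k (H x W)
    motions:    list of N tuples (R_k, t_k, p_k)
    cam_motion: (R_cam, t_cam)
    intrinsics: (c0, c1, f)
    Returns the H x W x 2 optical flow field."""
    c0, c1, f = intrinsics
    H, W = depth.shape
    y, x = np.mgrid[0:H, 0:W].astype(np.float64)
    # Back-project all pixels to camera space at time t.
    P = np.stack([(x - c0) * depth / f,
                  (y - c1) * depth / f,
                  depth], axis=-1)
    # Apply each object's rigid motion inside its full-image mask.
    P_moved = P.copy()
    for M_k, (R_k, t_k, p_k) in zip(masks, motions):
        delta = (P - p_k) @ R_k.T + p_k + t_k - P
        P_moved += M_k[..., None] * delta
    # Apply the camera motion and project back to pixel coordinates.
    R_cam, t_cam = cam_motion
    P2 = P_moved @ R_cam.T + t_cam
    x2 = f * P2[..., 0] / P2[..., 2] + c0
    y2 = f * P2[..., 1] / P2[..., 2] + c1
    return np.stack([x2 - x, y2 - y], axis=-1)
\end{verbatim}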