mirror of https://github.com/tu-darmstadt-informatik/bsc-thesis.git, synced 2025-12-13 09:55:49 +00:00
WIP
This commit is contained in:
parent 763d508447
commit 5b046e41b5
background.tex | 263
@@ -17,28 +17,10 @@ to estimate disparity-based depth, however monocular depth estimation with deep
popular \cite{DeeperDepth, UnsupPoseDepth}.
In this preliminary work, we will assume per-pixel depth to be given.

\subsection{CNNs for dense motion estimation}

Deep convolutional neural network (CNN) architectures
\cite{ImageNetCNN, VGGNet, ResNet}
became widely popular through numerous successes in classification and recognition tasks.
The general structure of a CNN consists of a convolutional encoder, which
learns a spatially compressed, wide (in the number of channels) representation of the input image,
and a fully-connected prediction network on top of the encoder.

The compressed representations learned by CNNs of these categories do not, however, allow
for prediction of high-resolution output, as spatial detail is lost through repeated
application of pooling or strided convolutions.
Thus, networks for dense prediction introduce a convolutional decoder on top of the representation encoder,
performing upsampling of the compressed features and resulting in an encoder-decoder pyramid.
The most popular deep networks of this kind for end-to-end optical flow prediction
are variants of the FlowNet family \cite{FlowNet, FlowNet2},
which was recently extended to scene flow estimation \cite{SceneFlowDataset}.
Table \ref{table:flownets} shows the classical FlowNetS architecture for optical flow prediction.

{
%\begin{table}[h]
%\centering
\begin{longtable}{llr}
\begin{table}[h]
\centering
\begin{tabular}{llr}
\toprule
\textbf{Output} & \textbf{Layer Operations} & \textbf{Output Dimensions} \\
\midrule\midrule
@@ -63,18 +45,33 @@ Table \ref{table:flownets} shows the classical FlowNetS architecture for optical
\midrule
flow & $\times$ 2 bilinear upsample & H $\times$ W $\times$ 2 \\
\bottomrule

\end{tabular}
\caption{
FlowNetS \cite{FlowNet} architecture. Transpose convolutions (deconvolutions)
are used for refinement.
}
\label{table:flownets}
\end{longtable}

%\end{table}
\end{table}
}

\subsection{CNNs for dense motion estimation}

Deep convolutional neural network (CNN) architectures
\cite{ImageNetCNN, VGGNet, ResNet}
became widely popular through numerous successes in classification and recognition tasks.
The general structure of a CNN consists of a convolutional encoder, which
learns a spatially compressed, wide (in the number of channels) representation of the input image,
and a fully-connected prediction network on top of the encoder.

The compressed representations learned by CNNs of these categories do not, however, allow
for prediction of high-resolution output, as spatial detail is lost through repeated
application of pooling or strided convolutions.
Thus, networks for dense prediction introduce a convolutional decoder on top of the representation encoder,
performing upsampling of the compressed features and resulting in an encoder-decoder pyramid.
The most popular deep networks of this kind for end-to-end optical flow prediction
are variants of the FlowNet family \cite{FlowNet, FlowNet2},
which was recently extended to scene flow estimation \cite{SceneFlowDataset}.
Table \ref{table:flownets} shows the classical FlowNetS architecture for optical flow prediction.
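
As a minimal illustration of this encoder-decoder pattern, consider the following
PyTorch-style sketch; the layer counts and channel widths are placeholders and do
not reproduce the actual FlowNetS configuration of Table \ref{table:flownets}.
\begin{verbatim}
# Minimal encoder-decoder sketch for dense flow prediction (illustrative only).
import torch
import torch.nn as nn

class TinyFlowNet(nn.Module):
    def __init__(self):
        super().__init__()
        # Encoder: strided convolutions compress spatial resolution.
        self.encoder = nn.Sequential(
            nn.Conv2d(6, 64, 7, stride=2, padding=3), nn.ReLU(),    # 1/2
            nn.Conv2d(64, 128, 5, stride=2, padding=2), nn.ReLU(),  # 1/4
            nn.Conv2d(128, 256, 3, stride=2, padding=1), nn.ReLU()) # 1/8
        # Decoder: transpose convolutions upsample towards input resolution.
        self.decoder = nn.Sequential(
            nn.ConvTranspose2d(256, 128, 4, stride=2, padding=1), nn.ReLU(),
            nn.ConvTranspose2d(128, 64, 4, stride=2, padding=1), nn.ReLU())
        self.predict_flow = nn.Conv2d(64, 2, 3, padding=1)  # 2-channel flow

    def forward(self, image_pair):  # N x 6 x H x W: two stacked RGB frames
        flow = self.predict_flow(self.decoder(self.encoder(image_pair)))
        # Final x2 bilinear upsampling back to full resolution.
        return nn.functional.interpolate(flow, scale_factor=2, mode='bilinear')
\end{verbatim}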

Note that the network itself is a rather generic autoencoder and is specialized for optical flow only through being trained
with supervision from dense optical flow ground truth.
Potentially, the same network could also be used for semantic segmentation if

@@ -93,9 +90,9 @@ are predicted in addition to a depth map, and an unsupervised re-projection loss
image brightness differences penalizes the predictions.
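
To make the idea of such a loss concrete, the following is a heavily simplified
sketch of depth-based re-projection with a single rigid motion $(R, t)$ and known
intrinsics $K$; SfM-Net additionally composes per-object motions and uses
differentiable bilinear sampling instead of the nearest-neighbour lookup used here.
\begin{verbatim}
# Simplified photometric re-projection loss for grayscale frames I0, I1.
import numpy as np

def reprojection_loss(I0, I1, depth, R, t, K):
    H, W = depth.shape
    ys, xs = np.mgrid[0:H, 0:W]
    pix = np.stack([xs, ys, np.ones_like(xs)]).reshape(3, -1)  # 3 x HW
    # Back-project to 3D, apply the rigid motion, project into the other frame.
    points = depth.reshape(1, -1) * (np.linalg.inv(K) @ pix)
    proj = K @ (R @ points + t.reshape(3, 1))
    u = np.clip(np.round(proj[0] / proj[2]).astype(int), 0, W - 1)
    v = np.clip(np.round(proj[1] / proj[2]).astype(int), 0, H - 1)
    warped = I1[v, u].reshape(H, W)    # I1 sampled at re-projected positions
    return np.abs(I0 - warped).mean()  # mean brightness difference
\end{verbatim}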

{
%\begin{table}[h]
%\centering
\begin{longtable}{llr}
\begin{table}[h]
\centering
\begin{tabular}{llr}
\toprule
\textbf{Output} & \textbf{Layer Operations} & \textbf{Output Dimensions} \\
\midrule\midrule
@@ -117,6 +114,9 @@ camera motion & From FC: $\times$ 2 & H $\times$ W $\times$ 6 \\
depth & 1 $\times$ 1 conv, 1 & H $\times$ W $\times$ 1 \\
\bottomrule

\end{tabular}
\caption{
SfM-Net \cite{SfmNet} architecture. Here, Conv-Deconv is a simple fully convolutional
encoder-decoder network, where convolutions and deconvolutions with stride 2 are
@@ -126,10 +126,7 @@ The Conv-Deconv weights for the structure and motion networks are not shared,
and N$_{motions} = 3$.
}
\label{table:sfmnet}
\end{longtable}

%\end{table}
\end{table}
}

\subsection{ResNet}
@@ -158,9 +155,9 @@ to increase the bottleneck stride to 64, following FlowNetS.

{
%\begin{table}[h]
%\centering
\begin{longtable}{llr}
\begin{table}[h]
\centering
\begin{tabular}{llr}
\toprule
\textbf{Output} & \textbf{Layer Operations} & \textbf{Output Dimensions} \\
\midrule\midrule
@@ -213,7 +210,7 @@ $\begin{bmatrix}
& $\tfrac{1}{64}$ H $\times$ $\tfrac{1}{64}$ W $\times$ 2048 \\

\bottomrule

\end{tabular}
\caption{
Backbone architecture based on ResNet-50 \cite{ResNet}.
Operations enclosed in a []$_b$ block make up a single ResNet \enquote{bottleneck}
@@ -223,10 +220,7 @@ is only applied to the first block, but not to repeated blocks.
Batch normalization \cite{BN} is used after every residual unit.
}
\label{table:resnet}
\end{longtable}

%\end{table}
\end{table}
}
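
As a sketch of a single such bottleneck unit (PyTorch-style; the widths are
illustrative, and the placement of batch normalization and ReLU follows the
common ResNet-50 pattern rather than the exact configuration above):
\begin{verbatim}
# One ResNet "bottleneck" residual unit: 1x1 reduce -> 3x3 -> 1x1 expand.
import torch.nn as nn
import torch.nn.functional as F

class Bottleneck(nn.Module):
    def __init__(self, in_ch, mid_ch, stride=1):
        super().__init__()
        out_ch = 4 * mid_ch  # expansion factor 4, as in ResNet-50
        self.branch = nn.Sequential(
            nn.Conv2d(in_ch, mid_ch, 1, bias=False),
            nn.BatchNorm2d(mid_ch), nn.ReLU(),
            nn.Conv2d(mid_ch, mid_ch, 3, stride=stride, padding=1, bias=False),
            nn.BatchNorm2d(mid_ch), nn.ReLU(),
            nn.Conv2d(mid_ch, out_ch, 1, bias=False), nn.BatchNorm2d(out_ch))
        # Project the identity path if resolution or width changes.
        self.skip = (nn.Identity() if stride == 1 and in_ch == out_ch else
                     nn.Sequential(
                         nn.Conv2d(in_ch, out_ch, 1, stride=stride, bias=False),
                         nn.BatchNorm2d(out_ch)))

    def forward(self, x):
        return F.relu(self.branch(x) + self.skip(x))
\end{verbatim}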

\begin{figure}[t]
@@ -267,6 +261,57 @@ Thus, given region proposals, the per-region computation is reduced to a single
speeding up the system by two orders of magnitude at inference time and one order of magnitude
at training time.
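
The computation pattern behind this speedup can be sketched as follows, where
backbone, roi_extract and head stand in for the components described above:
\begin{verbatim}
# Fast R-CNN inference pattern: the expensive backbone runs once per image,
# only the lightweight head runs once per region proposal.
def fast_rcnn_inference(image, proposals, backbone, roi_extract, head):
    feature_map = backbone(image)            # computed once per image
    outputs = []
    for box in proposals:                    # one cheap head pass per RoI
        roi = roi_extract(feature_map, box)  # fixed-size feature crop
        outputs.append(head(roi))            # class scores + box refinement
    return outputs
\end{verbatim}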

{
\begin{table}[t]
\centering
\begin{tabular}{llr}
\toprule
\textbf{Output} & \textbf{Layer Operations} & \textbf{Output Dimensions} \\
\midrule\midrule
& input image & H $\times$ W $\times$ C \\
\midrule
C$_4$ & ResNet \{up to C$_4$\} (Table \ref{table:resnet}) & $\tfrac{1}{16}$ H $\times$ $\tfrac{1}{16}$ W $\times$ 1024 \\
\midrule
\multicolumn{3}{c}{\textbf{Region Proposal Network (RPN)}}\\
\midrule
R$_0$ & From C$_4$: 1 $\times$ 1 conv, 512 & $\tfrac{1}{16}$ H $\times$ $\tfrac{1}{16}$ W $\times$ 512 \\
& 1 $\times$ 1 conv, 4 & $\tfrac{1}{16}$ H $\times$ $\tfrac{1}{16}$ W $\times$ $N_a \cdot$ 4 \\
& flatten & A $\times$ 4 \\
boxes$_{\mathrm{RPN}}$ & decode bounding boxes (Eq. \ref{eq:pred_bounding_box}) & A $\times$ 4\\
& From R$_0$: 1 $\times$ 1 conv, 2 & $\tfrac{1}{16}$ H $\times$ $\tfrac{1}{16}$ W $\times$ $N_a \cdot$ 2 \\
& flatten & A $\times$ 2 \\
scores$_{\mathrm{RPN}}$& softmax & A $\times$ 2 \\
ROI$_{\mathrm{RPN}}$ & sample boxes$_{\mathrm{RPN}}$ and scores$_{\mathrm{RPN}}$ & N$_{RoI}$ $\times$ 6 \\
\midrule
\multicolumn{3}{c}{\textbf{RoI Head}}\\
\midrule
& From C$_4$ with ROI$_{\mathrm{RPN}}$: RoI extraction & N$_{RoI}$ $\times$ 7 $\times$ 7 $\times$ 1024 \\
R$_1$& ResNet \{C$_5$ without stride\} (Table \ref{table:resnet}) & N$_{RoI}$ $\times$ 7 $\times$ 7 $\times$ 2048 \\
ave & average pool & N$_{RoI}$ $\times$ 2048 \\
& From ave: fully connected, N$_{cls}$ $\cdot$ 4 & N$_{RoI}$ $\times$ N$_{cls}$ $\cdot$ 4\\
boxes & decode bounding boxes (Eq. \ref{eq:pred_bounding_box}) & N$_{RoI}$ $\times$ N$_{cls}$ $\cdot$ 4\\
& From ave: fully connected, N$_{cls}$ & N$_{RoI}$ $\times$ N$_{cls}$ \\
classes& softmax, N$_{cls}$ & N$_{RoI}$ $\times$ N$_{cls}$ \\
\midrule
\multicolumn{3}{c}{\textbf{RoI Head: Masks}}\\
\midrule
M$_0$ & From R$_1$: 2 $\times$ 2 deconv, 256, stride 2 & N$_{RoI}$ $\times$ 14 $\times$ 14 $\times$ 256 \\
& 1 $\times$ 1 conv, N$_{cls}$ & N$_{RoI}$ $\times$ 14 $\times$ 14 $\times$ N$_{cls}$ \\
masks & sigmoid, N$_{cls}$ & N$_{RoI}$ $\times$ 14 $\times$ 14 $\times$ N$_{cls}$ \\

\bottomrule
\end{tabular}
\caption{
Mask R-CNN \cite{MaskRCNN} ResNet \cite{ResNet} architecture.
Note that this is equivalent to the Faster R-CNN ResNet architecture if the mask
head is left out. In Mask R-CNN, bilinear sampling is used for RoI extraction,
whereas Faster R-CNN used RoI pooling.
}
\label{table:maskrcnn_resnet}
\end{table}
}
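
The tables above repeatedly refer to bounding box decoding (Eq.
\ref{eq:pred_bounding_box}). As a reference sketch, assuming the standard R-CNN
parameterization (which this equation presumably follows): anchors are given as
centers and sizes, centers are shifted by a predicted fraction of the anchor
size, and sizes are scaled log-linearly.
\begin{verbatim}
# Sketch of standard R-CNN box decoding from anchors and predicted deltas,
# both given as N x 4 arrays: (cx, cy, w, h) and (tx, ty, tw, th).
import numpy as np

def decode_boxes(anchors, deltas):
    cx = anchors[:, 0] + anchors[:, 2] * deltas[:, 0]  # shift center by tx * w
    cy = anchors[:, 1] + anchors[:, 3] * deltas[:, 1]
    w = anchors[:, 2] * np.exp(deltas[:, 2])           # scale width by exp(tw)
    h = anchors[:, 3] * np.exp(deltas[:, 3])
    return np.stack([cx, cy, w, h], axis=1)
\end{verbatim}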

\paragraph{Faster R-CNN}
After streamlining the CNN components, Fast R-CNN is limited by the speed of the region proposal
algorithm, which has to be run prior to the network passes and makes up a large portion of the total
@@ -313,56 +358,61 @@ bilinear sampling for extracting the RoI features, which is much more precise.
In the original RoI pooling from Fast R-CNN, the bins for max-pooling are not aligned with the actual pixel
boundary of the bounding box, and thus some detail is lost.
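
The core of the more precise extraction is sampling the feature map at
continuous, non-integer coordinates. A minimal sketch of such a bilinear lookup
(RoI extraction then evaluates, e.g., a regular grid of these points spread over
the fractional box, instead of max-pooling over rounded bins):
\begin{verbatim}
# Bilinear sampling of a feature map (H x W) at a continuous position (x, y).
import numpy as np

def bilinear_sample(fmap, x, y):
    x0, y0 = int(np.floor(x)), int(np.floor(y))
    x1 = min(x0 + 1, fmap.shape[1] - 1)
    y1 = min(y0 + 1, fmap.shape[0] - 1)
    ax, ay = x - x0, y - y0  # fractional offsets within the pixel cell
    return ((1 - ay) * ((1 - ax) * fmap[y0, x0] + ax * fmap[y0, x1]) +
            ay * ((1 - ax) * fmap[y1, x0] + ax * fmap[y1, x1]))
\end{verbatim}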

{
%\begin{table}[t]
%\centering
\begin{longtable}{llr}
\begin{table}[h]
\centering
\begin{tabular}{llr}
\toprule
\textbf{Output} & \textbf{Layer Operations} & \textbf{Output Dimensions} \\
\midrule\midrule
& input image & H $\times$ W $\times$ C \\
\midrule
C$_4$ & ResNet \{up to C$_4$\} (Table \ref{table:resnet}) & $\tfrac{1}{16}$ H $\times$ $\tfrac{1}{16}$ W $\times$ 1024 \\
C$_5$ & ResNet (Table \ref{table:resnet}) & $\tfrac{1}{32}$ H $\times$ $\tfrac{1}{32}$ W $\times$ 1024 \\
\midrule
\multicolumn{3}{c}{\textbf{Feature Pyramid Network (FPN)}}\\
\midrule
P$_5$ & From C$_5$: 1 $\times$ 1 conv, 256 & $\tfrac{1}{32}$ H $\times$ $\tfrac{1}{32}$ W $\times$ 256 \\
P$_4$ & $\begin{bmatrix}\textrm{skip from C$_4$}\end{bmatrix}_p$ & $\tfrac{1}{16}$ H $\times$ $\tfrac{1}{16}$ W $\times$ 256 \\
P$_3$ & $\begin{bmatrix}\textrm{skip from C$_3$}\end{bmatrix}_p$ & $\tfrac{1}{8}$ H $\times$ $\tfrac{1}{8}$ W $\times$ 256 \\
P$_2$ & $\begin{bmatrix}\textrm{skip from C$_2$}\end{bmatrix}_p$ & $\tfrac{1}{4}$ H $\times$ $\tfrac{1}{4}$ W $\times$ 256 \\
P$_6$ & From P$_5$: 2 $\times$ 2 subsample, 256 & $\tfrac{1}{64}$ H $\times$ $\tfrac{1}{64}$ W $\times$ 256 \\
\midrule
\multicolumn{3}{c}{\textbf{Region Proposal Network (RPN)}}\\
\midrule
R$_0$ & From C$_4$: 1 $\times$ 1 conv, 512 & $\tfrac{1}{16}$ H $\times$ $\tfrac{1}{16}$ W $\times$ 512 \\
& 1 $\times$ 1 conv, 4 & $\tfrac{1}{16}$ H $\times$ $\tfrac{1}{16}$ W $\times$ $N_a \cdot$ 4 \\
& flatten & A $\times$ 4 \\
boxes$_{\mathrm{RPN}}$ & decode bounding boxes (Eq. \ref{eq:pred_bounding_box}) & A $\times$ 4\\
& From R$_0$: 1 $\times$ 1 conv, 2 & $\tfrac{1}{16}$ H $\times$ $\tfrac{1}{16}$ W $\times$ $N_a \cdot$ 2 \\
& flatten & A $\times$ 2 \\
scores$_{\mathrm{RPN}}$& softmax & A $\times$ 2 \\
ROI$_{\mathrm{RPN}}$ & sample boxes$_{\mathrm{RPN}}$ and scores$_{\mathrm{RPN}}$ & N$_{RoI}$ $\times$ 6 \\
\multicolumn{3}{c}{$\forall i \in \{2...6\}$}\\
& From P$_i$: 1 $\times$ 1 conv, 512 & $\tfrac{1}{2^i}$ H $\times$ $\tfrac{1}{2^i}$ W $\times$ 512 \\
& 1 $\times$ 1 conv, 6 & $\tfrac{1}{2^i}$ H $\times$ $\tfrac{1}{2^i}$ W $\times$ $N_a \cdot$ 6 \\
RPN$_i$& flatten & A$_i$ $\times$ 6 \\
\midrule
& From \{RPN$_2$ ... RPN$_6$\}: concatenate & A $\times$ 6 \\
& decode bounding boxes (Eq. \ref{eq:pred_bounding_box}) & A $\times$ 6 \\
ROI$_{\mathrm{RPN}}$ & sample bounding boxes \& scores & N$_{RoI}$ $\times$ 6 \\
\midrule
\multicolumn{3}{c}{\textbf{RoI Head}}\\
\midrule
& From C$_4$ with ROI$_{\mathrm{RPN}}$: RoI extraction & N$_{RoI}$ $\times$ 7 $\times$ 7 $\times$ 1024 \\
R$_1$& ResNet \{C$_5$ without stride\} (Table \ref{table:resnet}) & N$_{RoI}$ $\times$ 7 $\times$ 7 $\times$ 2048 \\
ave & average pool & N$_{RoI}$ $\times$ 2048 \\
& From ave: fully connected, N$_{cls}$ $\cdot$ 4 & N$_{RoI}$ $\times$ N$_{cls}$ $\cdot$ 4\\
R$_2$ & From \{P$_2$ ... P$_6$\} with ROI$_{\mathrm{RPN}}$: RoI extraction (Eq. \ref{eq:level_assignment}) & N$_{RoI}$ $\times$ 14 $\times$ 14 $\times$ 256 \\
& 2 $\times$ 2 max pool & N$_{RoI}$ $\times$ 7 $\times$ 7 $\times$ 256 \\
F$_1$ & $\begin{bmatrix}\textrm{fully connected}, 1024\end{bmatrix}$ $\times$ 2 & N$_{RoI}$ $\times$ 1024 \\
& From F$_1$: fully connected, N$_{cls}$ $\cdot$ 4 & N$_{RoI}$ $\times$ N$_{cls}$ $\cdot$ 4 \\
boxes & decode bounding boxes (Eq. \ref{eq:pred_bounding_box}) & N$_{RoI}$ $\times$ N$_{cls}$ $\cdot$ 4\\
& From ave: fully connected, N$_{cls}$ & N$_{RoI}$ $\times$ N$_{cls}$ \\
& From F$_1$: fully connected, N$_{cls}$ & N$_{RoI}$ $\times$ N$_{cls}$ \\
classes& softmax, N$_{cls}$ & N$_{RoI}$ $\times$ N$_{cls}$ \\
\midrule
\multicolumn{3}{c}{\textbf{RoI Head: Masks}}\\
\midrule
M$_0$ & From R$_1$: 2 $\times$ 2 deconv, 256, stride 2 & N$_{RoI}$ $\times$ 14 $\times$ 14 $\times$ 256 \\
& 1 $\times$ 1 conv, N$_{cls}$ & N$_{RoI}$ $\times$ 14 $\times$ 14 $\times$ N$_{cls}$ \\
M$_1$ & From R$_2$: $\begin{bmatrix}\textrm{3 $\times$ 3 conv} \end{bmatrix}$ $\times$ 4, 256 & N$_{RoI}$ $\times$ 14 $\times$ 14 $\times$ 256 \\
& 2 $\times$ 2 deconv, 256, stride 2 & N$_{RoI}$ $\times$ 28 $\times$ 28 $\times$ 256 \\
& 1 $\times$ 1 conv, N$_{cls}$ & N$_{RoI}$ $\times$ 28 $\times$ 28 $\times$ N$_{cls}$ \\
masks & sigmoid, N$_{cls}$ & N$_{RoI}$ $\times$ 28 $\times$ 28 $\times$ N$_{cls}$ \\

\bottomrule

\end{tabular}
\caption{
Mask R-CNN \cite{MaskRCNN} ResNet \cite{ResNet} architecture.
Note that this is equivalent to the Faster R-CNN ResNet architecture if the mask
head is left out. In Mask R-CNN, bilinear sampling is used for RoI extraction,
whereas Faster R-CNN used RoI pooling.
Mask R-CNN \cite{MaskRCNN} ResNet-FPN \cite{ResNet} architecture.
Operations enclosed in a []$_p$ block make up a single FPN
block (see Figure \ref{figure:fpn_block}).
}
\label{table:maskrcnn_resnet}
\end{longtable}
%\end{table}
\label{table:maskrcnn_resnet_fpn}
\end{table}
}

\paragraph{Feature Pyramid Networks}
@@ -414,64 +464,6 @@ anchor is computed. Now, for example, the smallest boxes are cropped from $P_2$,
which is the highest resolution feature map.
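
A sketch of this level assignment heuristic, as given in the FPN paper (to which
Eq. \ref{eq:level_assignment} presumably corresponds): RoIs are mapped to a
pyramid level based on their area, relative to a canonical size of 224.
\begin{verbatim}
# FPN level assignment: larger RoIs come from coarser pyramid levels.
import math

def fpn_level(w, h, k0=4, k_min=2, k_max=5):
    k = math.floor(k0 + math.log2(math.sqrt(w * h) / 224.0))
    return min(max(k, k_min), k_max)  # clip to the available levels
\end{verbatim}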

{
%\begin{table}[t]
%\centering
\begin{longtable}{llr}
\toprule
\textbf{Output} & \textbf{Layer Operations} & \textbf{Output Dimensions} \\
\midrule\midrule
& input image & H $\times$ W $\times$ C \\
\midrule
C$_5$ & ResNet (Table \ref{table:resnet}) & $\tfrac{1}{32}$ H $\times$ $\tfrac{1}{32}$ W $\times$ 1024 \\
\midrule
\multicolumn{3}{c}{\textbf{Feature Pyramid Network (FPN)}}\\
\midrule
P$_5$ & From C$_5$: 1 $\times$ 1 conv, 256 & $\tfrac{1}{32}$ H $\times$ $\tfrac{1}{32}$ W $\times$ 256 \\
P$_4$ & $\begin{bmatrix}\textrm{skip from C$_4$}\end{bmatrix}_p$ & $\tfrac{1}{16}$ H $\times$ $\tfrac{1}{16}$ W $\times$ 256 \\
P$_3$ & $\begin{bmatrix}\textrm{skip from C$_3$}\end{bmatrix}_p$ & $\tfrac{1}{8}$ H $\times$ $\tfrac{1}{8}$ W $\times$ 256 \\
P$_2$ & $\begin{bmatrix}\textrm{skip from C$_2$}\end{bmatrix}_p$ & $\tfrac{1}{4}$ H $\times$ $\tfrac{1}{4}$ W $\times$ 256 \\
P$_6$ & From P$_5$: 2 $\times$ 2 subsample, 256 & $\tfrac{1}{64}$ H $\times$ $\tfrac{1}{64}$ W $\times$ 256 \\
\midrule
\multicolumn{3}{c}{\textbf{Region Proposal Network (RPN)}}\\
\midrule
\multicolumn{3}{c}{$\forall i \in \{2...6\}$}\\
& From P$_i$: 1 $\times$ 1 conv, 512 & $\tfrac{1}{2^i}$ H $\times$ $\tfrac{1}{2^i}$ W $\times$ 512 \\
& 1 $\times$ 1 conv, 6 & $\tfrac{1}{2^i}$ H $\times$ $\tfrac{1}{2^i}$ W $\times$ $N_a \cdot$ 6 \\
RPN$_i$& flatten & A$_i$ $\times$ 6 \\
\midrule
& From \{RPN$_2$ ... RPN$_6$\}: concatenate & A $\times$ 6 \\
& decode bounding boxes (Eq. \ref{eq:pred_bounding_box}) & A $\times$ 6 \\
ROI$_{\mathrm{RPN}}$ & sample bounding boxes \& scores & N$_{RoI}$ $\times$ 6 \\
\midrule
\multicolumn{3}{c}{\textbf{RoI Head}}\\
\midrule
R$_2$ & From \{P$_2$ ... P$_6$\} with ROI$_{\mathrm{RPN}}$: RoI extraction (Eq. \ref{eq:level_assignment}) & N$_{RoI}$ $\times$ 14 $\times$ 14 $\times$ 256 \\
& 2 $\times$ 2 max pool & N$_{RoI}$ $\times$ 7 $\times$ 7 $\times$ 256 \\
F$_1$ & $\begin{bmatrix}\textrm{fully connected}, 1024\end{bmatrix}$ $\times$ 2 & N$_{RoI}$ $\times$ 1024 \\
& From F$_1$: fully connected, N$_{cls}$ $\cdot$ 4 & N$_{RoI}$ $\times$ N$_{cls}$ $\cdot$ 4 \\
boxes & decode bounding boxes (Eq. \ref{eq:pred_bounding_box}) & N$_{RoI}$ $\times$ N$_{cls}$ $\cdot$ 4\\
& From F$_1$: fully connected, N$_{cls}$ & N$_{RoI}$ $\times$ N$_{cls}$ \\
classes& softmax, N$_{cls}$ & N$_{RoI}$ $\times$ N$_{cls}$ \\
\midrule
\multicolumn{3}{c}{\textbf{RoI Head: Masks}}\\
\midrule
M$_1$ & From R$_2$: $\begin{bmatrix}\textrm{3 $\times$ 3 conv} \end{bmatrix}$ $\times$ 4, 256 & N$_{RoI}$ $\times$ 14 $\times$ 14 $\times$ 256 \\
& 2 $\times$ 2 deconv, 256, stride 2 & N$_{RoI}$ $\times$ 28 $\times$ 28 $\times$ 256 \\
& 1 $\times$ 1 conv, N$_{cls}$ & N$_{RoI}$ $\times$ 28 $\times$ 28 $\times$ N$_{cls}$ \\
masks & sigmoid, N$_{cls}$ & N$_{RoI}$ $\times$ 28 $\times$ 28 $\times$ N$_{cls}$ \\
\bottomrule

\caption{
Mask R-CNN \cite{MaskRCNN} ResNet-FPN \cite{ResNet} architecture.
Operations enclosed in a []$_p$ block make up a single FPN
block (see Figure \ref{figure:fpn_block}).
}
\label{table:maskrcnn_resnet_fpn}
\end{longtable}
%\end{table}
}

\begin{figure}[t]
\centering
\includegraphics[width=0.3\textwidth]{figures/fpn}
@@ -497,9 +489,16 @@ For regression, we define the smooth-$\ell_1$ regression loss as
which provides a certain robustness to outliers and will be used
frequently in the following chapters. For vector or tuple arguments, the sum of the componentwise scalar
losses is computed.
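
As a numerical sketch, assuming the common Fast R-CNN form of this loss
(quadratic below $|x| = 1$, linear above, hence less sensitive to outliers than
a squared loss):
\begin{verbatim}
# Smooth-L1 loss, summed over components as defined above.
import numpy as np

def smooth_l1(x):
    x = np.abs(np.asarray(x, dtype=float))
    return np.where(x < 1.0, 0.5 * x ** 2, x - 0.5).sum()

print(smooth_l1([0.5, 2.0]))  # 0.125 + 1.5 = 1.625
\end{verbatim}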
For classification we define $\ell_{cls}$ as the cross-entropy classification loss.

\todo{formally define cross-entropy losses?}
For classification with mutually exclusive classes, we define the categorical (softmax) cross-entropy loss,
\begin{equation}
\ell_{cls}(c, c^*) = -\log(c_{c^*}),
\end{equation}
where $c^* \in \{1, \dots, N_{cls}\}$ is a label (or vector of labels) and $c \in (0,1)^{N_{cls}}$ is the output of a softmax layer.
Finally, for multi-label classification, we define the binary (sigmoid) cross-entropy loss,
\begin{equation}
\ell_{cls*}(y, y^*) = -y^* \cdot \log(y) - (1 - y^*) \cdot \log(1 - y),
\end{equation}
where $y^* \in \{0,1\}$ is a label (or vector of labels) and $y \in (0,1)$ is the output of a sigmoid layer.
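
A short numerical sketch of the two definitions for a single example:
\begin{verbatim}
# Categorical (softmax) and binary (sigmoid) cross-entropy, as defined above.
import numpy as np

def categorical_ce(c, c_star):
    return -np.log(c[c_star - 1])  # log-probability of the true class c_star

def binary_ce(y, y_star):          # componentwise, summed
    return (-y_star * np.log(y) - (1 - y_star) * np.log(1 - y)).sum()

c = np.array([0.7, 0.2, 0.1])              # softmax output over 3 classes
print(categorical_ce(c, 1))                # -log(0.7) ~ 0.357
y, y_star = np.array([0.9, 0.2]), np.array([1.0, 0.0])
print(binary_ce(y, y_star))                # -log(0.9) - log(0.8) ~ 0.328
\end{verbatim}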

\label{ssec:rcnn_techn}
\paragraph{Bounding box regression}
@@ -626,7 +625,7 @@ L_{box} = \frac{1}{N_{RoI}^{fg}} \sum_{i=1}^{N_{RoI}} [c_i^* \geq 1] \cdot \ell_
\end{equation}
is the average smooth-$\ell_1$ bounding box regression loss,
\begin{equation}
L_{mask} = \frac{1}{N_{RoI}^{fg}} \sum_{i=1}^{N_{RoI}} [c_i^* \geq 1] \cdot \ell_{cls}(m_i,m_i^*)
L_{mask} = \frac{1}{N_{RoI}^{fg}} \sum_{i=1}^{N_{RoI}} [c_i^* \geq 1] \cdot \ell_{cls*}(m_i,m_i^*)
\end{equation}
is the average binary cross-entropy mask loss,
\begin{equation}

bib.bib | 2
@@ -294,7 +294,7 @@
@inproceedings{UnsupFlownet,
  title={Back to Basics: Unsupervised Learning of Optical Flow via Brightness Constancy and Motion Smoothness},
  author={Jason J. Yu and Adam W. Harley and Konstantinos G. Derpanis},
  booktitle={ECCV Workshops},
  booktitle={ECCV 2016 Workshops},
  year={2016}}

@article{ImageNet,
@@ -111,7 +111,7 @@ let $R^{k,c_k}, t^{k,c_k}, p^{k,c_k}, o^{k,c_k}$ be the predicted motion for cla
and $R^{gt,i_k}, t^{gt,i_k}, p^{gt,i_k}, o^{gt,i_k}$ the ground truth motion for the example $i_k$.
Then, assuming there are $N$ such detections,
\begin{equation}
E_{R} = \frac{1}{N}\sum_k \arccos\left( \min\left\{1, \max\left\{-1, \frac{tr(\mathrm{inv}(R^{k,c_k}) \cdot R^{gt,i_k}) - 1}{2} \right\}\right\} \right)
E_{R} = \frac{1}{N}\sum_k \arccos\left( \min\left\{1, \max\left\{-1, \frac{\mathrm{tr}(\mathrm{inv}(R^{k,c_k}) \cdot R^{gt,i_k}) - 1}{2} \right\}\right\} \right)
\end{equation}
measures the mean angle of the error rotation between predicted and ground truth rotation,
\begin{equation}
@@ -125,24 +125,24 @@ is the mean euclidean norm between predicted and ground truth pivot.
Moreover, we define precision and recall measures for the detection of moving objects,
where
\begin{equation}
O_{pr} = \frac{tp}{tp + fp}
O_{pr} = \frac{\mathit{TP}}{\mathit{TP} + \mathit{FP}}
\end{equation}
is the fraction of objects which are actually moving among all objects classified as moving,
and
\begin{equation}
O_{rc} = \frac{tp}{tp + fn}
O_{rc} = \frac{\mathit{TP}}{\mathit{TP} + \mathit{FN}}
\end{equation}
is the fraction of objects correctly classified as moving among all objects which are actually moving.
Here, we used
\begin{equation}
tp = \sum_k [o^{k,c_k} = 1 \land o^{gt,i_k} = 1],
\mathit{TP} = \sum_k [o^{k,c_k} = 1 \land o^{gt,i_k} = 1],
\end{equation}
\begin{equation}
fp = \sum_k [o^{k,c_k} = 1 \land o^{gt,i_k} = 0],
\mathit{FP} = \sum_k [o^{k,c_k} = 1 \land o^{gt,i_k} = 0],
\end{equation}
and
\begin{equation}
fn = \sum_k [o^{k,c_k} = 0 \land o^{gt,i_k} = 1].
\mathit{FN} = \sum_k [o^{k,c_k} = 0 \land o^{gt,i_k} = 1].
\end{equation}
Analogously, we define error metrics $E_{R}^{cam}$ and $E_{t}^{cam}$ for
predicted camera motions.
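
A sketch of the rotation error for a single prediction, directly following the
definition of $E_R$ above:
\begin{verbatim}
# Angle of the residual rotation inv(R_pred) @ R_gt, recovered from its trace.
import numpy as np

def rotation_error(R_pred, R_gt):
    cos_angle = (np.trace(np.linalg.inv(R_pred) @ R_gt) - 1.0) / 2.0
    return np.arccos(np.clip(cos_angle, -1.0, 1.0))  # clip guards numerics

# Identity prediction vs. a 90-degree ground truth rotation about z:
Rz = np.array([[0., -1., 0.], [1., 0., 0.], [0., 0., 1.]])
print(np.degrees(rotation_error(np.eye(3), Rz)))  # 90.0
\end{verbatim}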