From 5b046e41b585ac2e8553ef290fc43f5bc1007d19 Mon Sep 17 00:00:00 2001
From: Simon Meister
Date: Sun, 19 Nov 2017 15:39:29 +0100
Subject: [PATCH] WIP

---
 background.tex  | 263 ++++++++++++++++++++++++------------------------
 bib.bib         |   2 +-
 experiments.tex |  12 +--
 3 files changed, 138 insertions(+), 139 deletions(-)

diff --git a/background.tex b/background.tex
index 041a6fa..00b9540 100644
--- a/background.tex
+++ b/background.tex
@@ -17,28 +17,10 @@ to estimate disparity-based depth, however monocular depth estimation with deep
 popular \cite{DeeperDepth, UnsupPoseDepth}.
 In this preliminary work, we will assume per-pixel depth to be given.
 
-\subsection{CNNs for dense motion estimation}
-Deep convolutional neural network (CNN) architectures
-\cite{ImageNetCNN, VGGNet, ResNet}
-became widely popular through numerous successes in classification and recognition tasks.
-The general structure of a CNN consists of a convolutional encoder, which
-learns a spatially compressed, wide (in the number of channels) representation of the input image,
-and a fully-connected prediction network on top of the encoder.
-
-The compressed representations learned by CNNs of these categories do not, however, allow
-for prediction of high-resolution output, as spatial detail is lost through sequential applications
-of pooling or strides.
-Thus, networks for dense prediction introduce a convolutional decoder on top of the representation encoder,
-performing upsampling of the compressed features and resulting in a encoder-decoder pyramid.
-The most popular deep networks of this kind for end-to-end optical flow prediction
-are variants of the FlowNet family \cite{FlowNet, FlowNet2},
-which was recently extended to scene flow estimation \cite{SceneFlowDataset}.
-Table \ref{table:flownets} shows the classical FlowNetS architecture for optical flow prediction.
-
 {
-%\begin{table}[h]
-%\centering
-\begin{longtable}{llr}
+\begin{table}[h]
+\centering
+\begin{tabular}{llr}
 \toprule
 \textbf{Output} & \textbf{Layer Operations} & \textbf{Output Dimensions} \\
 \midrule\midrule
@@ -63,18 +45,33 @@ Table \ref{table:flownets} shows the classical FlowNetS architecture for optical
 \midrule
 flow & $\times$ 2 bilinear upsample & H $\times$ W $\times$ 2 \\
 \bottomrule
-
+\end{tabular}
 \caption {
 FlowNetS \cite{FlowNet} architecture.
 Transpose convolutions (deconvolutions) are used for refinement.
 }
 \label{table:flownets}
-\end{longtable}
-
-
-%\end{table}
+\end{table}
 }
 
+\subsection{CNNs for dense motion estimation}
+Deep convolutional neural network (CNN) architectures
+\cite{ImageNetCNN, VGGNet, ResNet}
+became widely popular through numerous successes in classification and recognition tasks.
+The general structure of a CNN consists of a convolutional encoder, which
+learns a spatially compressed, wide (in the number of channels) representation of the input image,
+and a fully-connected prediction network on top of the encoder.
+
+The compressed representations learned by such CNNs do not, however, allow
+for prediction of high-resolution output, as spatial detail is lost through sequential applications
+of pooling or strided convolutions.
+Thus, networks for dense prediction introduce a convolutional decoder on top of the representation encoder,
+performing upsampling of the compressed features and resulting in an encoder-decoder pyramid.
+The most popular deep networks of this kind for end-to-end optical flow prediction
+are variants of the FlowNet family \cite{FlowNet, FlowNet2},
+which was recently extended to scene flow estimation \cite{SceneFlowDataset}.
+Table \ref{table:flownets} shows the classical FlowNetS architecture for optical flow prediction.
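+
+To make this encoder-decoder structure concrete, the sketch below outlines a
+minimal FlowNetS-style network in PyTorch. The module name \texttt{MiniFlowNet}
+and the layer widths are illustrative choices, not the exact configuration of
+Table \ref{table:flownets}.
+\begin{verbatim}
+import torch
+import torch.nn as nn
+
+class MiniFlowNet(nn.Module):
+    """Minimal FlowNetS-style encoder-decoder (illustrative widths)."""
+    def __init__(self):
+        super().__init__()
+        # Encoder: strided convolutions trade spatial detail for channels.
+        self.enc1 = nn.Sequential(nn.Conv2d(6, 64, 7, 2, 3), nn.ReLU())
+        self.enc2 = nn.Sequential(nn.Conv2d(64, 128, 5, 2, 2), nn.ReLU())
+        self.enc3 = nn.Sequential(nn.Conv2d(128, 256, 3, 2, 1), nn.ReLU())
+        # Decoder: transpose convolutions upsample the compressed features.
+        self.dec2 = nn.Sequential(nn.ConvTranspose2d(256, 128, 4, 2, 1), nn.ReLU())
+        self.dec1 = nn.Sequential(nn.ConvTranspose2d(128 + 128, 64, 4, 2, 1), nn.ReLU())
+        self.pred = nn.Conv2d(64 + 64, 2, 3, 1, 1)  # two-channel flow field
+
+    def forward(self, frames):                  # frames: N x 6 x H x W
+        e1 = self.enc1(frames)                  # N x 64  x H/2 x W/2
+        e2 = self.enc2(e1)                      # N x 128 x H/4 x W/4
+        e3 = self.enc3(e2)                      # N x 256 x H/8 x W/8
+        d2 = self.dec2(e3)                      # N x 128 x H/4 x W/4
+        d1 = self.dec1(torch.cat([d2, e2], 1))  # skip connection from encoder
+        flow = self.pred(torch.cat([d1, e1], 1))
+        return nn.functional.interpolate(       # x2 bilinear upsample to H x W
+            flow, scale_factor=2.0, mode='bilinear', align_corners=False)
+\end{verbatim}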
+
 Note that the network itself is a rather generic autoencoder and is specialized
 for optical flow only through being trained with supervision from dense optical
 flow ground truth.
 Potentially, the same network could also be used for semantic segmentation if
@@ -93,9 +90,9 @@ are predicted in addition to a depth map, and a unsupervised re-projection loss
 on image brightness differences penalizes the predictions.
 
 {
-%\begin{table}[h]
-%\centering
-\begin{longtable}{llr}
+\begin{table}[h]
+\centering
+\begin{tabular}{llr}
 \toprule
 \textbf{Output} & \textbf{Layer Operations} & \textbf{Output Dimensions} \\
 \midrule\midrule
@@ -117,6 +114,9 @@ camera motion & From FC: $\times$ 2 & H $\times$ W $\times$ 6 \\
 depth & 1 $\times$ 1 conv, 1 & H $\times$ W $\times$ 1 \\
 \bottomrule
+
+\end{tabular}
+
 \caption {
 SfM-Net \cite{SfmNet} architecture.
 Here, Conv-Deconv is a simple fully convolutional encoder-decoder network,
 where convolutions and deconvolutions with stride 2 are
@@ -126,10 +126,7 @@ The Conv-Deconv weights for the structure and motion networks are not shared,
 and N$_{motions} = 3$.
 }
 \label{table:sfmnet}
-\end{longtable}
-
-
-%\end{table}
+\end{table}
 }
 
 \subsection{ResNet}
@@ -158,9 +155,9 @@ to increase the bottleneck stride to 64, following FlowNetS.
 
 {
-%\begin{table}[h]
-%\centering
-\begin{longtable}{llr}
+\begin{table}[h]
+\centering
+\begin{tabular}{llr}
 \toprule
 \textbf{Output} & \textbf{Layer Operations} & \textbf{Output Dimensions} \\
 \midrule\midrule
@@ -213,7 +210,7 @@ $\begin{bmatrix}
 & $\tfrac{1}{64}$ H $\times$ $\tfrac{1}{64}$ W $\times$ 2048 \\
 \bottomrule
-
+\end{tabular}
 \caption {
 Backbone architecture based on ResNet-50 \cite{ResNet}.
 Operations enclosed in a []$_b$ block make up a single ResNet \enquote{bottleneck}
@@ -223,10 +220,7 @@ is only applied to the first block, but not to repeated blocks.
 Batch normalization \cite{BN} is used after every residual unit.
 }
 \label{table:resnet}
-\end{longtable}
-
-
-%\end{table}
+\end{table}
 }
 
 \begin{figure}[t]
@@ -267,6 +261,57 @@ Thus, given region proposals, the per-region computation is reduced to a single
 speeding up the system by two orders of magnitude at inference time and one
 order of magnitude at training time.
 
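+The resulting division of labor can be sketched in a few lines of PyTorch; the
+backbone and head below are stand-in modules, and \texttt{roi\_align} from
+torchvision takes the place of the RoI extraction step.
+\begin{verbatim}
+import torch
+from torchvision.ops import roi_align
+
+backbone = torch.nn.Conv2d(3, 256, 7, 16, 3)  # stand-in: image -> 1/16 features
+head = torch.nn.Linear(256 * 7 * 7, 21)       # stand-in per-region classifier
+
+image = torch.randn(1, 3, 512, 512)
+features = backbone(image)                    # expensive: run ONCE per image
+
+# Region proposals as (batch index, x1, y1, x2, y2) in image coordinates.
+rois = torch.tensor([[0., 32., 32., 224., 224.],
+                     [0., 100., 80., 400., 300.]])
+# Cheap per-region step: crop fixed-size features from the shared map.
+crops = roi_align(features, rois, output_size=(7, 7), spatial_scale=1 / 16.)
+scores = head(crops.flatten(1))               # one score vector per region
+\end{verbatim}
+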
+{
+\begin{table}[t]
+\centering
+\begin{tabular}{llr}
+\toprule
+\textbf{Output} & \textbf{Layer Operations} & \textbf{Output Dimensions} \\
+\midrule\midrule
+& input image & H $\times$ W $\times$ C \\
+\midrule
+C$_4$ & ResNet \{up to C$_4$\} (Table \ref{table:resnet}) & $\tfrac{1}{16}$ H $\times$ $\tfrac{1}{16}$ W $\times$ 1024 \\
+\midrule
+\multicolumn{3}{c}{\textbf{Region Proposal Network (RPN)}}\\
+\midrule
+R$_0$ & From C$_4$: 1 $\times$ 1 conv, 512 & $\tfrac{1}{16}$ H $\times$ $\tfrac{1}{16}$ W $\times$ 512 \\
+& 1 $\times$ 1 conv, 4 & $\tfrac{1}{16}$ H $\times$ $\tfrac{1}{16}$ W $\times$ $N_a \cdot$ 4 \\
+& flatten & A $\times$ 4 \\
+boxes$_{\mathrm{RPN}}$ & decode bounding boxes (Eq. \ref{eq:pred_bounding_box}) & A $\times$ 4\\
+& From R$_0$: 1 $\times$ 1 conv, 2 & $\tfrac{1}{16}$ H $\times$ $\tfrac{1}{16}$ W $\times$ $N_a \cdot$ 2 \\
+& flatten & A $\times$ 2 \\
+scores$_{\mathrm{RPN}}$& softmax & A $\times$ 2 \\
+ROI$_{\mathrm{RPN}}$ & sample boxes$_{\mathrm{RPN}}$ and scores$_{\mathrm{RPN}}$ & N$_{RoI}$ $\times$ 6 \\
+\midrule
+\multicolumn{3}{c}{\textbf{RoI Head}}\\
+\midrule
+& From C$_4$ with ROI$_{\mathrm{RPN}}$: RoI extraction & N$_{RoI}$ $\times$ 7 $\times$ 7 $\times$ 1024 \\
+R$_1$& ResNet \{C$_5$ without stride\} (Table \ref{table:resnet}) & N$_{RoI}$ $\times$ 7 $\times$ 7 $\times$ 2048 \\
+ave & average pool & N$_{RoI}$ $\times$ 2048 \\
+& From ave: fully connected, N$_{cls}$ $\cdot$ 4 & N$_{RoI}$ $\times$ N$_{cls}$ $\cdot$ 4\\
+boxes & decode bounding boxes (Eq. \ref{eq:pred_bounding_box}) & N$_{RoI}$ $\times$ N$_{cls}$ $\cdot$ 4\\
+& From ave: fully connected, N$_{cls}$ & N$_{RoI}$ $\times$ N$_{cls}$ \\
+classes& softmax, N$_{cls}$ & N$_{RoI}$ $\times$ N$_{cls}$ \\
+\midrule
+\multicolumn{3}{c}{\textbf{RoI Head: Masks}}\\
+\midrule
+M$_0$ & From R$_1$: 2 $\times$ 2 deconv, 256, stride 2 & N$_{RoI}$ $\times$ 14 $\times$ 14 $\times$ 256 \\
+& 1 $\times$ 1 conv, N$_{cls}$ & N$_{RoI}$ $\times$ 14 $\times$ 14 $\times$ N$_{cls}$ \\
+masks & sigmoid, N$_{cls}$ & N$_{RoI}$ $\times$ 14 $\times$ 14 $\times$ N$_{cls}$ \\
+
+\bottomrule
+\end{tabular}
+\caption {
+Mask R-CNN \cite{MaskRCNN} ResNet \cite{ResNet} architecture.
+Note that this is equivalent to the Faster R-CNN ResNet architecture if the mask
+head is left out. In Mask R-CNN, bilinear sampling is used for RoI extraction,
+whereas Faster R-CNN uses RoI pooling.
+}
+\label{table:maskrcnn_resnet}
+\end{table}
+}
+
+
 \paragraph{Faster R-CNN}
 After streamlining the CNN components, Fast R-CNN is limited by the speed of the
 region proposal algorithm, which has to be run prior to the network passes and
 makes up a large portion of the total
@@ -313,56 +358,61 @@ bilinear sampling for extracting the RoI features, which is much more precise.
 In the original RoI pooling from Fast R-CNN, the bins for max-pooling are not
 aligned with the actual pixel boundary of the bounding box, and thus some
 detail is lost.
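+
+The bilinear interpolation underlying this can be written in a few lines of
+numpy; the feature map and query point below are made up for illustration.
+\begin{verbatim}
+import numpy as np
+
+def bilinear_sample(fmap, y, x):
+    """Sample fmap (H x W) at real-valued (y, x), 0 <= y < H-1, 0 <= x < W-1."""
+    y0, x0 = int(np.floor(y)), int(np.floor(x))
+    wy, wx = y - y0, x - x0  # fractional offsets
+    # Weighted average of the four surrounding feature values:
+    return ((1 - wy) * (1 - wx) * fmap[y0, x0] +
+            (1 - wy) * wx * fmap[y0, x0 + 1] +
+            wy * (1 - wx) * fmap[y0 + 1, x0] +
+            wy * wx * fmap[y0 + 1, x0 + 1])
+
+fmap = np.arange(16.0).reshape(4, 4)
+print(bilinear_sample(fmap, 1.5, 2.25))  # 8.25, halfway between rows 1 and 2
+\end{verbatim}
+Quantized RoI pooling would instead snap the point $(1.5, 2.25)$ to integer bin
+borders and discard the sub-pixel offset.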
-
 {
-%\begin{table}[t]
-%\centering
-\begin{longtable}{llr}
+\begin{table}[h]
+\centering
+\begin{tabular}{llr}
 \toprule
 \textbf{Output} & \textbf{Layer Operations} & \textbf{Output Dimensions} \\
 \midrule\midrule
 & input image & H $\times$ W $\times$ C \\
 \midrule
-C$_4$ & ResNet \{up to C$_4$\} (Table \ref{table:resnet}) & $\tfrac{1}{16}$ H $\times$ $\tfrac{1}{16}$ W $\times$ 1024 \\
+C$_5$ & ResNet (Table \ref{table:resnet}) & $\tfrac{1}{32}$ H $\times$ $\tfrac{1}{32}$ W $\times$ 2048 \\
+\midrule
+\multicolumn{3}{c}{\textbf{Feature Pyramid Network (FPN)}}\\
+\midrule
+P$_5$ & From C$_5$: 1 $\times$ 1 conv, 256 & $\tfrac{1}{32}$ H $\times$ $\tfrac{1}{32}$ W $\times$ 256 \\
+P$_4$ & $\begin{bmatrix}\textrm{skip from C$_4$}\end{bmatrix}_p$ & $\tfrac{1}{16}$ H $\times$ $\tfrac{1}{16}$ W $\times$ 256 \\
+P$_3$ & $\begin{bmatrix}\textrm{skip from C$_3$}\end{bmatrix}_p$ & $\tfrac{1}{8}$ H $\times$ $\tfrac{1}{8}$ W $\times$ 256 \\
+P$_2$ & $\begin{bmatrix}\textrm{skip from C$_2$}\end{bmatrix}_p$ & $\tfrac{1}{4}$ H $\times$ $\tfrac{1}{4}$ W $\times$ 256 \\
+P$_6$ & From P$_5$: 2 $\times$ 2 subsample, 256 & $\tfrac{1}{64}$ H $\times$ $\tfrac{1}{64}$ W $\times$ 256 \\
 \midrule
 \multicolumn{3}{c}{\textbf{Region Proposal Network (RPN)}}\\
 \midrule
-R$_0$ & From C$_4$: 1 $\times$ 1 conv, 512 & $\tfrac{1}{16}$ H $\times$ $\tfrac{1}{16}$ W $\times$ 512 \\
-& 1 $\times$ 1 conv, 4 & $\tfrac{1}{16}$ H $\times$ $\tfrac{1}{16}$ W $\times$ $N_a \cdot$ 4 \\
-& flatten & A $\times$ 4 \\
-boxes$_{\mathrm{RPN}}$ & decode bounding boxes (Eq. \ref{eq:pred_bounding_box}) & A $\times$ 4\\
-& From R$_0$: 1 $\times$ 1 conv, 2 & $\tfrac{1}{16}$ H $\times$ $\tfrac{1}{16}$ W $\times$ $N_a \cdot$ 2 \\
-& flatten & A $\times$ 2 \\
-scores$_{\mathrm{RPN}}$& softmax & A $\times$ 2 \\
-ROI$_{\mathrm{RPN}}$ & sample boxes$_{\mathrm{RPN}}$ and scores$_{\mathrm{RPN}}$ & N$_{RoI}$ $\times$ 6 \\
+\multicolumn{3}{c}{$\forall i \in \{2, \dots, 6\}$}\\
+& From P$_i$: 1 $\times$ 1 conv, 512 & $\tfrac{1}{2^i}$ H $\times$ $\tfrac{1}{2^i}$ W $\times$ 512 \\
+& 1 $\times$ 1 conv, 6 & $\tfrac{1}{2^i}$ H $\times$ $\tfrac{1}{2^i}$ W $\times$ $N_a \cdot$ 6 \\
+RPN$_i$& flatten & A$_i$ $\times$ 6 \\
+\midrule
+& From \{RPN$_2$ ... RPN$_6$\}: concatenate & A $\times$ 6 \\
+& decode bounding boxes (Eq. \ref{eq:pred_bounding_box}) & A $\times$ 6 \\
+ROI$_{\mathrm{RPN}}$ & sample bounding boxes \& scores & N$_{RoI}$ $\times$ 6 \\
 \midrule
 \multicolumn{3}{c}{\textbf{RoI Head}}\\
 \midrule
-& From C$_4$ with ROI$_{\mathrm{RPN}}$: RoI extraction & N$_{RoI}$ $\times$ 7 $\times$ 7 $\times$ 1024 \\
-R$_1$& ResNet \{C$_5$ without stride\} (Table \ref{table:resnet}) & N$_{RoI}$ $\times$ 7 $\times$ 7 $\times$ 2048 \\
-ave & average pool & N$_{RoI}$ $\times$ 2048 \\
-& From ave: fully connected, N$_{cls}$ $\cdot$ 4 & N$_{RoI}$ $\times$ N$_{cls}$ $\cdot$ 4\\
+R$_2$ & From \{P$_2$ ... P$_6$\} with ROI$_{\mathrm{RPN}}$: RoI extraction (Eq. \ref{eq:level_assignment}) & N$_{RoI}$ $\times$ 14 $\times$ 14 $\times$ 256 \\
+& 2 $\times$ 2 max pool & N$_{RoI}$ $\times$ 7 $\times$ 7 $\times$ 256 \\
+F$_1$ & $\begin{bmatrix}\textrm{fully connected}, 1024\end{bmatrix}$ $\times$ 2 & N$_{RoI}$ $\times$ 1024 \\
+& From F$_1$: fully connected, N$_{cls}$ $\cdot$ 4 & N$_{RoI}$ $\times$ N$_{cls}$ $\cdot$ 4 \\
 boxes & decode bounding boxes (Eq. \ref{eq:pred_bounding_box}) & N$_{RoI}$ $\times$ N$_{cls}$ $\cdot$ 4\\
-& From ave: fully connected, N$_{cls}$ & N$_{RoI}$ $\times$ N$_{cls}$ \\
+& From F$_1$: fully connected, N$_{cls}$ & N$_{RoI}$ $\times$ N$_{cls}$ \\
 classes& softmax, N$_{cls}$ & N$_{RoI}$ $\times$ N$_{cls}$ \\
 \midrule
 \multicolumn{3}{c}{\textbf{RoI Head: Masks}}\\
 \midrule
-M$_0$ & From R$_1$: 2 $\times$ 2 deconv, 256, stride 2 & N$_{RoI}$ $\times$ 14 $\times$ 14 $\times$ 256 \\
-& 1 $\times$ 1 conv, N$_{cls}$ & N$_{RoI}$ $\times$ 14 $\times$ 14 $\times$ N$_{cls}$ \\
+M$_1$ & From R$_2$: $\begin{bmatrix}\textrm{3 $\times$ 3 conv} \end{bmatrix}$ $\times$ 4, 256 & N$_{RoI}$ $\times$ 14 $\times$ 14 $\times$ 256 \\
+& 2 $\times$ 2 deconv, 256, stride 2 & N$_{RoI}$ $\times$ 28 $\times$ 28 $\times$ 256 \\
+& 1 $\times$ 1 conv, N$_{cls}$ & N$_{RoI}$ $\times$ 28 $\times$ 28 $\times$ N$_{cls}$ \\
 masks & sigmoid, N$_{cls}$ & N$_{RoI}$ $\times$ 28 $\times$ 28 $\times$ N$_{cls}$ \\
-
 \bottomrule
-
+\end{tabular}
 \caption {
-Mask R-CNN \cite{MaskRCNN} ResNet \cite{ResNet} architecture.
-Note that this is equivalent to the Faster R-CNN ResNet architecture if the mask
-head is left out. In Mask R-CNN, bilinear sampling is used for RoI extraction,
-whereas Faster R-CNN used RoI pooling.
+Mask R-CNN \cite{MaskRCNN} ResNet-FPN \cite{ResNet} architecture.
+Operations enclosed in a []$_p$ block make up a single FPN
+block (see Figure \ref{figure:fpn_block}).
 }
-\label{table:maskrcnn_resnet}
-\end{longtable}
-%\end{table}
+\label{table:maskrcnn_resnet_fpn}
+\end{table}
 }
 
 \paragraph{Feature Pyramid Networks}
@@ -414,64 +464,6 @@ anchor is computed.
 Now, for example, the smallest boxes are cropped from $P_2$, which is the
 highest resolution feature map.
 
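+For orientation, the level assignment heuristic proposed with the original FPN
+architecture can be written as
+\begin{equation}
+k = \left\lfloor k_0 + \log_2 \left( \sqrt{w h} / 224 \right) \right\rfloor,
+\end{equation}
+where $w$ and $h$ are the RoI width and height in image coordinates and
+$k_0 = 4$ is the level assigned to an RoI of the canonical $224^2$
+pre-training size, so that larger boxes map to coarser pyramid levels.
+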
-{
-%\begin{table}[t]
-%\centering
-\begin{longtable}{llr}
-\toprule
-\textbf{Output} & \textbf{Layer Operations} & \textbf{Output Dimensions} \\
-\midrule\midrule
-& input image & H $\times$ W $\times$ C \\
-\midrule
-C$_5$ & ResNet (Table \ref{table:resnet}) & $\tfrac{1}{32}$ H $\times$ $\tfrac{1}{32}$ W $\times$ 1024 \\
-\midrule
-\multicolumn{3}{c}{\textbf{Feature Pyramid Network (FPN)}}\\
-\midrule
-P$_5$ & From C$_5$: 1 $\times$ 1 conv, 256 & $\tfrac{1}{32}$ H $\times$ $\tfrac{1}{32}$ W $\times$ 256 \\
-P$_4$ & $\begin{bmatrix}\textrm{skip from C$_4$}\end{bmatrix}_p$ & $\tfrac{1}{16}$ H $\times$ $\tfrac{1}{16}$ W $\times$ 256 \\
-P$_3$ & $\begin{bmatrix}\textrm{skip from C$_3$}\end{bmatrix}_p$ & $\tfrac{1}{8}$ H $\times$ $\tfrac{1}{8}$ W $\times$ 256 \\
-P$_2$ & $\begin{bmatrix}\textrm{skip from C$_2$}\end{bmatrix}_p$ & $\tfrac{1}{4}$ H $\times$ $\tfrac{1}{4}$ W $\times$ 256 \\
-P$_6$ & From P$_5$: 2 $\times$ 2 subsample, 256 & $\tfrac{1}{64}$ H $\times$ $\tfrac{1}{64}$ W $\times$ 256 \\
-\midrule
-\multicolumn{3}{c}{\textbf{Region Proposal Network (RPN)}}\\
-\midrule
-\multicolumn{3}{c}{$\forall i \in \{2...6\}$}\\
-& From P$_i$: 1 $\times$ 1 conv, 512 & $\tfrac{1}{2^i}$ H $\times$ $\tfrac{1}{2^i}$ W $\times$ 512 \\
-& 1 $\times$ 1 conv, 6 & $\tfrac{1}{2^i}$ H $\times$ $\tfrac{1}{2^i}$ W $\times$ $N_a \cdot$ 6 \\
-RPN$_i$& flatten & A$_i$ $\times$ 6 \\
-\midrule
-& From \{RPN$_2$ ... RPN$_6$\}: concatenate & A $\times$ 6 \\
-& decode bounding boxes (Eq. \ref{eq:pred_bounding_box}) & A $\times$ 6 \\
-ROI$_{\mathrm{RPN}}$ & sample bounding boxes \& scores & N$_{RoI}$ $\times$ 6 \\
-\midrule
-\multicolumn{3}{c}{\textbf{RoI Head}}\\
-\midrule
-R$_2$ & From \{P$_2$ ... P$_6$\} with ROI$_{\mathrm{RPN}}$: RoI extraction (Eq. \ref{eq:level_assignment}) & N$_{RoI}$ $\times$ 14 $\times$ 14 $\times$ 256 \\
-& 2 $\times$ 2 max pool & N$_{RoI}$ $\times$ 7 $\times$ 7 $\times$ 256 \\
-F$_1$ & $\begin{bmatrix}\textrm{fully connected}, 1024\end{bmatrix}$ $\times$ 2 & N$_{RoI}$ $\times$ 1024 \\
-& From F$_1$: fully connected, N$_{cls}$ $\cdot$ 4 & N$_{RoI}$ $\times$ N$_{cls}$ $\cdot$ 4 \\
-boxes & decode bounding boxes (Eq. \ref{eq:pred_bounding_box}) & N$_{RoI}$ $\times$ N$_{cls}$ $\cdot$ 4\\
-& From F$_1$: fully connected, N$_{cls}$ & N$_{RoI}$ $\times$ N$_{cls}$ \\
-classes& softmax, N$_{cls}$ & N$_{RoI}$ $\times$ N$_{cls}$ \\
-\midrule
-\multicolumn{3}{c}{\textbf{RoI Head: Masks}}\\
-\midrule
-M$_1$ & From R$_2$: $\begin{bmatrix}\textrm{3 $\times$ 3 conv} \end{bmatrix}$ $\times$ 4, 256 & N$_{RoI}$ $\times$ 14 $\times$ 14 $\times$ 256 \\
-& 2 $\times$ 2 deconv, 256, stride 2 & N$_{RoI}$ $\times$ 28 $\times$ 28 $\times$ 256 \\
-& 1 $\times$ 1 conv, N$_{cls}$ & N$_{RoI}$ $\times$ 28 $\times$ 28 $\times$ N$_{cls}$ \\
-masks & sigmoid, N$_{cls}$ & N$_{RoI}$ $\times$ 28 $\times$ 28 $\times$ N$_{cls}$ \\
-\bottomrule
-
-\caption {
-Mask R-CNN \cite{MaskRCNN} ResNet-FPN \cite{ResNet} architecture.
-Operations enclosed in a []$_p$ block make up a single FPN
-block (see Figure \ref{figure:fpn_block}).
-}
-\label{table:maskrcnn_resnet_fpn}
-\end{longtable}
-%\end{table}
-}
-
 \begin{figure}[t]
 \centering
 \includegraphics[width=0.3\textwidth]{figures/fpn}
@@ -497,9 +489,16 @@ For regression, we define the smooth-$\ell_1$ regression loss as
 which provides a certain robustness to outliers and will be used frequently in the
 following chapters.
 For vector or tuple arguments, the sum of the componentwise scalar losses is computed.
-For classification we define $\ell_{cls}$ as the cross-entropy classification loss.
-
-\todo{formally define cross-entropy losses?}
+For classification with mutually exclusive classes, we define the categorical (softmax) cross-entropy loss,
+\begin{equation}
+\ell_{cls}(c, c^*) = -\log(c_{c^*}),
+\end{equation}
+where $c^* \in \{1, \dots, N_{cls}\}$ is the ground truth class label and $c \in (0,1)^{N_{cls}}$ is the vector of class probabilities output by a softmax layer.
+Finally, for multi-label classification, we define the binary (sigmoid) cross-entropy loss,
+\begin{equation}
+\ell_{cls*}(y, y^*) = -y^* \cdot \log(y) - (1 - y^*) \cdot \log(1 - y),
+\end{equation}
+where $y^* \in \{0,1\}$ is a label (or vector of labels) and $y \in (0,1)$ is the output of a sigmoid layer.
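+
+Spelled out in numpy, the three losses read as follows; the smooth-$\ell_1$
+threshold of one is assumed from the usual definition, as the defining
+equation lies outside this excerpt.
+\begin{verbatim}
+import numpy as np
+
+def smooth_l1(x, x_star):
+    """Componentwise smooth-L1, summed over vector arguments."""
+    d = np.abs(np.asarray(x, float) - np.asarray(x_star, float))
+    return np.sum(np.where(d < 1.0, 0.5 * d ** 2, d - 0.5))
+
+def softmax_ce(c, c_star):
+    """Categorical cross-entropy: c softmax outputs, c_star a class index."""
+    return -np.log(c[c_star])
+
+def sigmoid_ce(y, y_star):
+    """Binary cross-entropy, summed componentwise; y sigmoid outputs."""
+    y, y_star = np.asarray(y, float), np.asarray(y_star, float)
+    return np.sum(-y_star * np.log(y) - (1 - y_star) * np.log(1 - y))
+
+print(smooth_l1([0.2, 3.0], [0.0, 0.0]))         # 0.02 + 2.5 = 2.52
+print(softmax_ce(np.array([0.7, 0.2, 0.1]), 0))  # -log(0.7), about 0.36
+print(sigmoid_ce([0.9, 0.2], [1.0, 0.0]))        # about 0.105 + 0.223
+\end{verbatim}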
 
 \label{ssec:rcnn_techn}
 
 \paragraph{Bounding box regression}
@@ -626,7 +625,7 @@ L_{box} = \frac{1}{N_{RoI}^{fg}} \sum_{i=1}^{N_{RoI}} [c_i^* \geq 1] \cdot \ell_
 \end{equation}
 is the average smooth-$\ell_1$ bounding box regression loss,
 \begin{equation}
-L_{mask} = \frac{1}{N_{RoI}^{fg}} \sum_{i=1}^{N_{RoI}} [c_i^* \geq 1] \cdot \ell_{cls}(m_i,m_i^*)
+L_{mask} = \frac{1}{N_{RoI}^{fg}} \sum_{i=1}^{N_{RoI}} [c_i^* \geq 1] \cdot \ell_{cls*}(m_i,m_i^*)
 \end{equation}
 is the average binary cross-entropy mask loss,
 \begin{equation}
diff --git a/bib.bib b/bib.bib
index 9ee4c4e..7cee841 100644
--- a/bib.bib
+++ b/bib.bib
@@ -294,7 +294,7 @@
 @inproceedings{UnsupFlownet,
 title={Back to Basics: Unsupervised Learning of Optical Flow via Brightness Constancy and Motion Smoothness},
 author={Jason J. Yu and Adam W. Harley and Konstantinos G.
 Derpanis},
-booktitle={ECCV Workshops},
+booktitle={ECCV 2016 Workshops},
 year={2016}}
 
 @article{ImageNet,
diff --git a/experiments.tex b/experiments.tex
index b33fd57..87d2853 100644
--- a/experiments.tex
+++ b/experiments.tex
@@ -111,7 +111,7 @@ let $R^{k,c_k}, t^{k,c_k}, p^{k,c_k}, o^{k,c_k}$ be the predicted motion for cla
 and $R^{gt,i_k}, t^{gt,i_k}, p^{gt,i_k}, o^{gt,i_k}$ the ground truth motion for
 the example $i_k$. Then, assuming there are $N$ such detections,
 \begin{equation}
-E_{R} = \frac{1}{N}\sum_k \arccos\left( \min\left\{1, \max\left\{-1, \frac{tr(\mathrm{inv}(R^{k,c_k}) \cdot R^{gt,i_k}) - 1}{2} \right\}\right\} \right)
+E_{R} = \frac{1}{N}\sum_k \arccos\left( \min\left\{1, \max\left\{-1, \frac{\mathrm{tr}(\mathrm{inv}(R^{k,c_k}) \cdot R^{gt,i_k}) - 1}{2} \right\}\right\} \right)
 \end{equation}
 measures the mean angle of the error rotation between predicted and ground truth
 rotation,
 \begin{equation}
@@ -125,24 +125,24 @@ is the mean euclidean norm between predicted and ground truth pivot.
 Moreover, we define precision and recall measures for the detection of moving
 objects, where
 \begin{equation}
-O_{pr} = \frac{tp}{tp + fp}
+O_{pr} = \frac{\mathit{TP}}{\mathit{TP} + \mathit{FP}}
 \end{equation}
 is the fraction of objects which are actually moving among all objects
 classified as moving, and
 \begin{equation}
-O_{rc} = \frac{tp}{tp + fn}
+O_{rc} = \frac{\mathit{TP}}{\mathit{TP} + \mathit{FN}}
 \end{equation}
 is the fraction of objects correctly classified as moving among all objects
 which are actually moving.
 Here, we used
 \begin{equation}
-tp = \sum_k [o^{k,c_k} = 1 \land o^{gt,i_k} = 1],
+\mathit{TP} = \sum_k [o^{k,c_k} = 1 \land o^{gt,i_k} = 1],
 \end{equation}
 \begin{equation}
-fp = \sum_k [o^{k,c_k} = 1 \land o^{gt,i_k} = 0],
+\mathit{FP} = \sum_k [o^{k,c_k} = 1 \land o^{gt,i_k} = 0],
 \end{equation}
 and
 \begin{equation}
-fn = \sum_k [o^{k,c_k} = 0 \land o^{gt,i_k} = 1].
+\mathit{FN} = \sum_k [o^{k,c_k} = 0 \land o^{gt,i_k} = 1].
 \end{equation}
 Analogously, we define error metrics $E_{R}^{cam}$ and $E_{t}^{cam}$ for
 predicted camera motions.
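+
+For illustration, the per-detection term of $E_{R}$ can be computed as in the
+numpy sketch below; the example rotation is made up.
+\begin{verbatim}
+import numpy as np
+
+def rotation_angle_error(R_pred, R_gt):
+    """Angle (radians) of the error rotation between two rotation matrices."""
+    R_err = np.linalg.inv(R_pred) @ R_gt       # inv(R) equals R.T for rotations
+    cos = (np.trace(R_err) - 1.0) / 2.0
+    return np.arccos(np.clip(cos, -1.0, 1.0))  # clip guards numerical noise
+
+a = np.deg2rad(10.0)                           # 10-degree rotation about z
+R = np.array([[np.cos(a), -np.sin(a), 0.0],
+              [np.sin(a),  np.cos(a), 0.0],
+              [0.0,        0.0,       1.0]])
+print(np.rad2deg(rotation_angle_error(R, np.eye(3))))  # ~10.0
+\end{verbatim}
+$E_{R}$ then averages this quantity over all $N$ matched detections.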