pre-final

This commit is contained in:
Simon Meister 2017-11-22 15:50:35 +01:00
parent b895fa9f18
commit d0d7e6b176
7 changed files with 34 additions and 141 deletions

View File

@ -33,13 +33,10 @@ and compose a dense optical flow field based on instance-level and global motion
predictions. We train our network on the synthetic Virtual KITTI dataset,
which provides ground truth for all components of our system.
\end{abstract}
\renewcommand{\abstractname}{Zusammenfassung}
\begin{abstract}
\subsection*{Zusammenfassung}
Mit dem Aufkommen von Deep Learning
ist die Umfunktionierung generischer Deep Networks ein
ist das Umfunktionieren generischer Deep Networks ein
beliebter Ansatz für klassische Probleme der Computer Vision geworden,
die pixelweise Schätzung erfordern.

View File

@ -255,7 +255,7 @@ performs better in our case than the standard $\ell_1$-loss.
We thus compute the RoI motion loss as
\begin{equation}
\text{L}_{motion} = \frac{1}{\text{N}_{RoI}^{\mathit{fg}}} \sum_k^{\text{N}_{RoI}} l_{p}^k + (l_{R}^k + l_{t}^k) \cdot o_k^* + l_o^k,
L_{motion} = \frac{1}{\text{N}_{RoI}^{\mathit{fg}}} \sum_k^{\text{N}_{RoI}} l_{p}^k + (l_{R}^k + l_{t}^k) \cdot o_k^* + l_o^k,
\end{equation}
where
\begin{equation}
@ -284,7 +284,7 @@ other than $c_k^*$ are not penalized.
Now, our modified RoI loss is
\begin{equation}
\text{L}_{RoI} = \text{L}_{cls} + \text{L}_{box} + \text{L}_{mask} + \text{L}_{motion}.
L_{RoI} = L_{cls} + L_{box} + L_{mask} + L_{motion}.
\end{equation}
\paragraph{Camera motion supervision}
@ -339,7 +339,7 @@ full image resolution, as
the depth crops and 2D point grid are at the same resolution as the predicted
$m \times m$ mask.
For each RoI, we can now compute $\text{L}_{RoI}$ and thus supervise the object motion
For each RoI, we can now compute $L_{RoI}$ and thus supervise the object motion
by penalizing the $m \times m$ optical flow grid.
If there is optical flow ground truth available, we can use the RoI bounding box to
crop and resize a region from the ground truth optical flow to match the RoI's
@ -377,7 +377,7 @@ Again, as for masks and bounding boxes in Mask R-CNN,
the predicted output object motions are the predicted object motions for the
highest scoring class.
\subsection{Dense flow from motion}
\subsection{Dense flow from 3D motion}
\label{ssec:postprocessing}
As a postprocessing, we compose the dense optical flow between $I_t$ and $I_{t+1}$ from the outputs of our Motion R-CNN network.
Given the depth map $d_t$ for frame $I_t$, we first create a 3D point cloud in camera space at time $t$,

View File

@ -594,14 +594,14 @@ it is negative, let $s_i$ be the predicted objectness score and $b_i$, $b_i^*$ t
predicted and ground truth bounding box encodings.
Then, the RPN loss is computed as
\begin{equation}
\text{L}_{RPN} = \text{L}_{obj} + \text{L}_{box}^{RPN},
L_{RPN} = L_{obj} + L_{box}^{RPN},
\end{equation}
where
\begin{equation}
\text{L}_{obj} = \frac{1}{\text{N}_{RPN}} \sum_{i=1}^{\text{N}_{RPN}} \ell_{cls}(s_i, s_i^*),
L_{obj} = \frac{1}{\text{N}_{RPN}} \sum_{i=1}^{\text{N}_{RPN}} \ell_{cls}(s_i, s_i^*),
\end{equation}
\begin{equation}
\text{L}_{box}^{RPN} = \frac{1}{\text{N}_{RPN}^{pos}} \sum_{i=1}^{\text{N}_{RPN}} s_i^* \cdot \ell_{reg}(b_i^* - b_i),
L_{box}^{RPN} = \frac{1}{\text{N}_{RPN}^{pos}} \sum_{i=1}^{\text{N}_{RPN}} s_i^* \cdot \ell_{reg}(b_i^* - b_i),
\end{equation}
and
\begin{equation}
@ -630,19 +630,19 @@ In our implementation, we use nearest neighbour resizing for resizing the mask
targets.
Then, the ROI loss is computed as
\begin{equation}
\text{L}_{RoI} = \text{L}_{cls} + \text{L}_{box} + \text{L}_{mask}
L_{RoI} = L_{cls} + L_{box} + L_{mask}
\end{equation}
where
\begin{equation}
\text{L}_{cls} = \frac{1}{\text{N}_{RoI}} \sum_{i=1}^{\text{N}_{RoI}} \ell_{cls}(c_i, c_i^*),
L_{cls} = \frac{1}{\text{N}_{RoI}} \sum_{i=1}^{\text{N}_{RoI}} \ell_{cls}(c_i, c_i^*),
\end{equation}
is the average cross-entropy classification loss,
\begin{equation}
\text{L}_{box} = \frac{1}{\text{N}_{RoI}^{\mathit{fg}}} \sum_{i=1}^{\text{N}_{RoI}} [c_i^* \geq 1] \cdot \ell_{reg}(b_i^* - b_i)
L_{box} = \frac{1}{\text{N}_{RoI}^{\mathit{fg}}} \sum_{i=1}^{\text{N}_{RoI}} [c_i^* \geq 1] \cdot \ell_{reg}(b_i^* - b_i)
\end{equation}
is the average smooth-$\ell_1$ bounding box regression loss,
\begin{equation}
\text{L}_{mask} = \frac{1}{\text{N}_{RoI}^{\mathit{fg}}} \sum_{i=1}^{\text{N}_{RoI}} [c_i^* \geq 1] \cdot \ell_{cls*}(m_i,m_i^*)
L_{mask} = \frac{1}{\text{N}_{RoI}^{\mathit{fg}}} \sum_{i=1}^{\text{N}_{RoI}} [c_i^* \geq 1] \cdot \ell_{cls*}(m_i,m_i^*)
\end{equation}
is the average binary cross-entropy mask loss,
\begin{equation}

View File

View File

@ -1,5 +0,0 @@
[0] Config.pm:343> INFO - This is Biber 2.5
[0] Config.pm:346> INFO - Logfile is 'thesis.aux.blg'
[36] biber:290> INFO - === So Okt 29, 2017, 10:29:33
[108] Utils.pm:165> ERROR - Cannot find control file 'thesis.aux.bcf'! - did you pass the "backend=biber" option to BibLaTeX?
[108] Biber.pm:113> INFO - ERRORS: 1

View File

@ -3,14 +3,15 @@
11pt, % Schriftgröße
bibliography=totoc, % Bibliografie automatisch im Inhaltsverzeichnis generieren
parskip=off, % Absatzabstand: off, half, full
oneside, % einseitiges Layout
% twoside, % zweiseitiges Layout
% oneside, % einseitiges Layout
twoside, % zweiseitiges Layout
article, % verwendet KOMA-Klasse scrartcl
% longdoc=true,
accentcolor=tud2b,
accentcolor=tud1b,
% colorbacktitle, % Hintergrund des Titels einfärben
colorback, % Hintergrund unter dem Titel einfärben
type=bsc, % für Bachelorarbeit
type=bsc, % für Bachelorarbeit,
openright=true
]{tudthesis}
\usepackage[a-1b]{pdfx}
@ -37,7 +38,11 @@
% Leider funktioniert hyperref nicht 100%-ig einwandfrei mit tudstyle; bei Problemen damit lieber aus dem Dokument entfernen!
% Einstellungen für hyperref
\hypersetup{%
pdftitle=Motion R-CNN,
pdfinfo={
Title={Motion R-CNN: Instance-level 3D Motion Estimation with Region-based CNNs},
Author={Simon Meister},
}
pdftitle={Motion R-CNN: Instance-level 3D Motion Estimation with Region-based CNNs},
pdfauthor={Simon Meister},
pdfsubject=B.Sc. Thesis,
unicode=true, % benötigt, damit Umlaute im pdftitle richtig dargestellt werden
@ -48,6 +53,11 @@
%% citecolor=NavyBlue%DeepPink3
}
\pdfinfo{
/Title (Motion R-CNN: Instance-level 3D Motion Estimation with Region-based CNNs)
/Author (Simon Meister)
}
\usepackage[
backend=biber, % biber ist das Standard-Backend für Biblatex. Für die Abwärtskompatibilität kann hier auch bibtex oder bibtex8 gewählt werden (siehe biblatex-Dokumentation)
style=numeric, %numeric, authortitle, alphabetic etc.
@ -108,23 +118,26 @@
%\affidavit{\myname}
\pagestyle{myheadings} % Seitenstil umschalten
\pagenumbering{gobble}
%\pagenumbering{Roman} % Seitennummerierung auf römische Zahlen ändern
%\setcounter{page}{1}
%\mymarkright{Version: \today} % Inhalt der Fußzeile
\input{abstract}
\clearpage
%\pagenumbering{Roman} % Seitennummerierung auf römische Zahlen ändern
\setcounter{page}{1}
\pagenumbering{arabic} % Arabische Seitenzahlen
\setcounter{tocdepth}{3}
\tableofcontents
\setcounter{page}{1}
\clearpage
% Aktuelle Seitenzahl speichern, da Wechsel auf arabische Zahlen die Zählung zurücksetzt
%% \newcounter{savedromanpagenumber}
%% \setcounter{savedromanpagenumber}{\value{page}}
%\pagenumbering{arabic} % Arabische Seitenzahlen
\section{Introduction}
\label{sec:introduction}

View File

@ -1,112 +0,0 @@
% Replace the following information with your document's actual
% metadata. If you do not want to set a value for a certain parameter,
% just omit it.
%
% Symbols permitted in metadata
% =============================
%
% Within the metadata, all printable ASCII characters except
% '\', '{', '}', and '%' represent themselves. Also, all printable
% Unicode characters from the basic multilingual plane (i.e., up to
% code point U+FFFF) can be used directly with the UTF-8 encoding.
% Consecutive whitespace characters are combined into a single
% space. Whitespace after a macro such as \copyright, \backslash, or
% \sep is ignored. Blank lines are not permitted. Moreover, the
% following markup can be used:
%
% '\ ' - a literal space (for example after a macro)
% \% - a literal '%'
% \{ - a literal '{'
% \} - a literal '}'
% \backslash - a literal '\'
% \copyright - the (c) copyright symbol
%
% The macro \sep is only permitted within \Author, \Keywords, and
% \Org. It is used to separate multiple authors, keywords, etc.
%
% List of supported metadata fields
% =================================
%
% Here is a complete list of user-definable metadata fields currently
% supported, and their meanings. More may be added in the future.
%
% General information:
%
% \Author - the document's human author. Separate multiple
% authors with \sep.
% \Title - the document's title.
% \Keywords - list of keywords, separated with \sep.
% \Subject - the abstract.
% \Org - publishers.
%
% Copyright information:
%
% \Copyright - a copyright statement.
% \CopyrightURL - location of a web page describing the owner
% and/or rights statement for this document.
% \Copyrighted - 'True' if the document is copyrighted, and
% 'False' if it isn't. This is automatically set
% to 'True' if either \Copyright or \CopyrightURL
% is specified, but can be overridden. For
% example, if the copyright statement is "Public
% Domain", this should be set to 'False'.
%
% Publication information:
%
% \PublicationType - The type of publication. If defined, must be
% one of book, catalog, feed, journal, magazine,
% manual, newsletter, pamphlet. This is
% automatically set to "journal" if \Journaltitle
% is specified, but can be overridden.
% \Journaltitle - The title of the journal in which the document
% was published.
% \Journalnumber - The ISSN for the publication in which the
% document was published.
% \Volume - Journal volume.
% \Issue - Journal issue/number.
% \Firstpage - First page number of the published version of
% the document.
% \Lastpage - Last page number of the published version of
% the document.
% \Doi - Digital Object Identifier (DOI) for the
% document, without the leading "doi:".
% \CoverDisplayDate - Date on the cover of the journal issue, as a
% human-readable text string.
% \CoverDate - Date on the cover of the journal issue, in a
% format suitable for storing in a database field
% with a 'date' data type.
\Title{Motion R-CNN: Instance-level 3D Motion Estimation with Region-based CNNs}
\Author{Simon Meister}
\Copyright{Copyright \copyright\ 2017 "Simon Meister"}
\Keywords{optical flow\sep
instance segmentation\sep
deep learning}
\Subject{With the advent of deep learning, it has become popular to re-purpose generic deep networks for classical
computer vision problems involving pixel-wise estimation.
Following this trend, many recent end-to-end deep learning approaches to optical flow and scene flow
predict complete, high resolution flow fields with a generic network for dense, pixel-wise prediction,
thereby ignoring the inherent structure of the underlying motion estimation problem and any physical
constraints within the scene.
We introduce a scalable end-to-end deep learning approach for dense motion estimation that respects the
structure of the scene as being composed of distinct objects, thus combining the representation learning
benefits and speed of end-to-end deep networks with a physically plausible scene model inspired by
slanted plane energy-minimization approaches to scene flow.
Building on recent advances in region-based convolutional networks (R-CNNs), we integrate motion
estimation with instance segmentation. Given two consecutive frames from a monocular RGB-D camera,
our resulting end-to-end deep network detects objects with precise per-pixel object masks and estimates
the 3D motion of each detected object between the frames. By additionally estimating a global camera
motion in the same network, we compose a dense optical flow field based on instance-level and global
motion predictions. We train our network on the synthetic Virtual KITTI dataset, which provides ground
truth for all components of our system.}
\setRGBcolorprofile{sRGB_IEC61966-2-1_black_scaled.icc}
{sRGB_IEC61966-2-1_black_scaled}
{sRGB IEC61966 v2.1 with black scaling}
{http://www.color.org}