From d0d7e6b176a6c7393b00dc8b3632b09bcc613d61 Mon Sep 17 00:00:00 2001
From: Simon Meister
Date: Wed, 22 Nov 2017 15:50:35 +0100
Subject: [PATCH] pre-final

---
 abstract.tex   |   7 +---
 approach.tex   |   8 ++--
 background.tex |  14 +++---
 thesis.aux.bbl |   0
 thesis.aux.blg |   5 ---
 thesis.tex     |  29 +++++++++----
 thesis.xmpdata | 112 -------------------------------------------------
 7 files changed, 34 insertions(+), 141 deletions(-)
 delete mode 100644 thesis.aux.bbl
 delete mode 100644 thesis.aux.blg
 delete mode 100644 thesis.xmpdata

diff --git a/abstract.tex b/abstract.tex
index 699a12a..7ece2e0 100644
--- a/abstract.tex
+++ b/abstract.tex
@@ -33,13 +33,10 @@ and compose a dense optical flow field based on instance-level and global
 motion predictions.
 We train our network on the synthetic Virtual KITTI dataset,
 which provides ground truth for all components of our system.
-\end{abstract}
-
-\renewcommand{\abstractname}{Zusammenfassung}
-\begin{abstract}
+\subsection*{\textbf{Zusammenfassung}}
 Mit dem Aufkommen von Deep Learning
-ist die Umfunktionierung generischer Deep Networks ein
+ist das Umfunktionieren generischer Deep Networks ein
 beliebter Ansatz für klassische Probleme der Computer Vision
 geworden, die pixelweise Schätzung erfordern.
diff --git a/approach.tex b/approach.tex
index 00ee3ae..d3da89d 100644
--- a/approach.tex
+++ b/approach.tex
@@ -255,7 +255,7 @@
 performs better in our case than the standard $\ell_1$-loss.
 We thus compute the RoI motion loss as
 \begin{equation}
-\text{L}_{motion} = \frac{1}{\text{N}_{RoI}^{\mathit{fg}}} \sum_k^{\text{N}_{RoI}} l_{p}^k + (l_{R}^k + l_{t}^k) \cdot o_k^* + l_o^k,
+L_{motion} = \frac{1}{\text{N}_{RoI}^{\mathit{fg}}} \sum_{k=1}^{\text{N}_{RoI}} \left( l_{p}^k + (l_{R}^k + l_{t}^k) \cdot o_k^* + l_o^k \right),
 \end{equation}
 where
 \begin{equation}
@@ -284,7 +284,7 @@
 other than $c_k^*$ are not penalized.
 Now, our modified RoI loss is
 \begin{equation}
-\text{L}_{RoI} = \text{L}_{cls} + \text{L}_{box} + \text{L}_{mask} + \text{L}_{motion}.
+L_{RoI} = L_{cls} + L_{box} + L_{mask} + L_{motion}.
 \end{equation}

 \paragraph{Camera motion supervision}
@@ -339,7 +339,7 @@ full image resolution,
 as the depth crops and 2D point grid are at the same resolution as
 the predicted $m \times m$ mask.
-For each RoI, we can now compute $\text{L}_{RoI}$ and thus supervise the object motion
+For each RoI, we can now compute $L_{RoI}$ and thus supervise the object motion
 by penalizing the $m \times m$ optical flow grid.
 If there is optical flow ground truth available, we can use the
 RoI bounding box to crop and resize a region from the ground truth
 optical flow to match the RoI's
@@ -377,7 +377,7 @@ Again, as for masks and bounding boxes in Mask R-CNN, the predicted output
 object motions are those predicted for the highest scoring class.
-\subsection{Dense flow from motion}
+\subsection{Dense flow from 3D motion}
 \label{ssec:postprocessing}
 As a postprocessing step, we compose the dense optical flow between $I_t$ and $I_{t+1}$
 from the outputs of our Motion R-CNN network.
 Given the depth map $d_t$ for frame $I_t$, we first create a 3D point cloud
 in camera space at time $t$,
diff --git a/background.tex b/background.tex
index 64a2b39..f11b970 100644
--- a/background.tex
+++ b/background.tex
@@ -594,14 +594,14 @@ it is negative, let $s_i$ be the predicted objectness score and $b_i$, $b_i^*$ t
 predicted and ground truth bounding box encodings.
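The regression loss $\ell_{reg}$ that appears in the equations below is defined outside this hunk. For reference, the standard smooth-$\ell_1$ form from Fast R-CNN, which the elided definition presumably matches (a reference sketch, not the thesis's own equation):

\begin{equation}
\ell_{reg}(x) = \sum_j
\begin{cases}
0.5 \, x_j^2 & \text{if } |x_j| < 1 \\
|x_j| - 0.5 & \text{otherwise},
\end{cases}
\end{equation}

applied to the components of the encoding residual $x = b_i^* - b_i$.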
 Then, the RPN loss is computed as
 \begin{equation}
-\text{L}_{RPN} = \text{L}_{obj} + \text{L}_{box}^{RPN},
+L_{RPN} = L_{obj} + L_{box}^{RPN},
 \end{equation}
 where
 \begin{equation}
-\text{L}_{obj} = \frac{1}{\text{N}_{RPN}} \sum_{i=1}^{\text{N}_{RPN}} \ell_{cls}(s_i, s_i^*),
+L_{obj} = \frac{1}{\text{N}_{RPN}} \sum_{i=1}^{\text{N}_{RPN}} \ell_{cls}(s_i, s_i^*),
 \end{equation}
 \begin{equation}
-\text{L}_{box}^{RPN} = \frac{1}{\text{N}_{RPN}^{pos}} \sum_{i=1}^{\text{N}_{RPN}} s_i^* \cdot \ell_{reg}(b_i^* - b_i),
+L_{box}^{RPN} = \frac{1}{\text{N}_{RPN}^{pos}} \sum_{i=1}^{\text{N}_{RPN}} s_i^* \cdot \ell_{reg}(b_i^* - b_i),
 \end{equation}
 and
 \begin{equation}
@@ -630,19 +630,19 @@ In our implementation, we use nearest neighbour resizing for resizing
 the mask targets.
 Then, the RoI loss is computed as
 \begin{equation}
-\text{L}_{RoI} = \text{L}_{cls} + \text{L}_{box} + \text{L}_{mask}
+L_{RoI} = L_{cls} + L_{box} + L_{mask}
 \end{equation}
 where
 \begin{equation}
-\text{L}_{cls} = \frac{1}{\text{N}_{RoI}} \sum_{i=1}^{\text{N}_{RoI}} \ell_{cls}(c_i, c_i^*),
+L_{cls} = \frac{1}{\text{N}_{RoI}} \sum_{i=1}^{\text{N}_{RoI}} \ell_{cls}(c_i, c_i^*),
 \end{equation}
 is the average cross-entropy classification loss,
 \begin{equation}
-\text{L}_{box} = \frac{1}{\text{N}_{RoI}^{\mathit{fg}}} \sum_{i=1}^{\text{N}_{RoI}} [c_i^* \geq 1] \cdot \ell_{reg}(b_i^* - b_i)
+L_{box} = \frac{1}{\text{N}_{RoI}^{\mathit{fg}}} \sum_{i=1}^{\text{N}_{RoI}} [c_i^* \geq 1] \cdot \ell_{reg}(b_i^* - b_i)
 \end{equation}
 is the average smooth-$\ell_1$ bounding box regression loss,
 \begin{equation}
-\text{L}_{mask} = \frac{1}{\text{N}_{RoI}^{\mathit{fg}}} \sum_{i=1}^{\text{N}_{RoI}} [c_i^* \geq 1] \cdot \ell_{cls*}(m_i,m_i^*)
+L_{mask} = \frac{1}{\text{N}_{RoI}^{\mathit{fg}}} \sum_{i=1}^{\text{N}_{RoI}} [c_i^* \geq 1] \cdot \ell_{cls*}(m_i,m_i^*)
 \end{equation}
 is the average binary cross-entropy mask loss,
 \begin{equation}
diff --git a/thesis.aux.bbl b/thesis.aux.bbl
deleted file mode 100644
index e69de29..0000000
diff --git a/thesis.aux.blg b/thesis.aux.blg
deleted file mode 100644
index 21041bd..0000000
--- a/thesis.aux.blg
+++ /dev/null
@@ -1,5 +0,0 @@
-[0] Config.pm:343> INFO - This is Biber 2.5
-[0] Config.pm:346> INFO - Logfile is 'thesis.aux.blg'
-[36] biber:290> INFO - === So Okt 29, 2017, 10:29:33
-[108] Utils.pm:165> ERROR - Cannot find control file 'thesis.aux.bcf'! - did you pass the "backend=biber" option to BibLaTeX?
-[108] Biber.pm:113> INFO - ERRORS: 1
diff --git a/thesis.tex b/thesis.tex
index 8d5eb06..c8785e4 100644
--- a/thesis.tex
+++ b/thesis.tex
@@ -3,14 +3,15 @@
   11pt, % Schriftgröße
   bibliography=totoc, % Bibliografie automatisch im Inhaltsverzeichnis generieren
   parskip=off, % Absatzabstand: off, half, full
-  oneside, % einseitiges Layout
-% twoside, % zweiseitiges Layout
+% oneside, % einseitiges Layout
+  twoside, % zweiseitiges Layout
   article, % verwendet KOMA-Klasse scrartcl
 % longdoc=true,
-  accentcolor=tud2b,
+  accentcolor=tud1b,
 % colorbacktitle, % Hintergrund des Titels einfärben
   colorback, % Hintergrund unter dem Titel einfärben
-  type=bsc, % für Bachelorarbeit
+  type=bsc, % für Bachelorarbeit,
+  openright=true
 ]{tudthesis}

 \usepackage[a-1b]{pdfx}
@@ -37,7 +38,11 @@
 % Leider funktioniert hyperref nicht 100%-ig einwandfrei mit tudstyle; bei Problemen damit lieber aus dem Dokument entfernen!
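The hunks below move the document metadata into hyperref, while thesis.xmpdata — the file from which pdfx reads its XMP metadata — is deleted at the end of this patch. Note that \pdfinfo only fills the PDF Info dictionary, not the XMP packet that PDF/A validation inspects. If the XMP metadata should survive the deletion, one option is to write the file from within thesis.tex itself; a minimal sketch, assuming filecontents may write before \documentclass (on a 2017-era kernel this needs \RequirePackage{filecontents} first):

% Sketch only: regenerate \jobname.xmpdata at compile time so that
% \usepackage[a-1b]{pdfx} still finds its XMP metadata.
\begin{filecontents*}{\jobname.xmpdata}
\Title{Motion R-CNN: Instance-level 3D Motion Estimation with Region-based CNNs}
\Author{Simon Meister}
\end{filecontents*}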
 % Einstellungen für hyperref
 \hypersetup{%
-  pdftitle=Motion R-CNN,
+  pdfinfo={
+    Title={Motion R-CNN: Instance-level 3D Motion Estimation with Region-based CNNs},
+    Author={Simon Meister},
+  },
+  pdftitle={Motion R-CNN: Instance-level 3D Motion Estimation with Region-based CNNs},
   pdfauthor=Simon Meister,
   pdfsubject=B.Sc. Thesis,
   unicode=true, % benötigt, damit Umlaute im pdftitle richtig dargestellt werden
@@ -48,6 +53,11 @@
 %%  citecolor=NavyBlue%DeepPink3
 }

+\pdfinfo{
+/Title (Motion R-CNN: Instance-level 3D Motion Estimation with Region-based CNNs)
+/Author (Simon Meister)
+}
+
 \usepackage[
   backend=biber, % biber ist das Standard-Backend für Biblatex. Für die Abwärtskompatibilität kann hier auch bibtex oder bibtex8 gewählt werden (siehe biblatex-Dokumentation)
   style=numeric, %numeric, authortitle, alphabetic etc.
@@ -108,23 +118,26 @@
 %\affidavit{\myname}

 \pagestyle{myheadings} % Seitenstil umschalten
+\pagenumbering{gobble}
+%\pagenumbering{Roman} % Seitennummerierung auf römische Zahlen ändern
+%\setcounter{page}{1}
 %\mymarkright{Version: \today} % Inhalt der Fußzeile

 \input{abstract}
 \clearpage
-%\pagenumbering{Roman} % Seitennummerierung auf römische Zahlen ändern
+\setcounter{page}{1}
+\pagenumbering{arabic} % Arabische Seitenzahlen

 \setcounter{tocdepth}{3}
 \tableofcontents
-\setcounter{page}{1}
 \clearpage

 % Aktuelle Seitenzahl speichern, da Wechsel auf arabische Zahlen die Zählung zurücksetzt
 %% \newcounter{savedromanpagenumber}
 %% \setcounter{savedromanpagenumber}{\value{page}}
-%\pagenumbering{arabic} % Arabische Seitenzahlen
+

 \section{Introduction}
 \label{sec:introduction}
diff --git a/thesis.xmpdata b/thesis.xmpdata
deleted file mode 100644
index ba6faa0..0000000
--- a/thesis.xmpdata
+++ /dev/null
@@ -1,112 +0,0 @@
-% Replace the following information with your document's actual
-% metadata. If you do not want to set a value for a certain parameter,
-% just omit it.
-%
-% Symbols permitted in metadata
-% =============================
-%
-% Within the metadata, all printable ASCII characters except
-% '\', '{', '}', and '%' represent themselves. Also, all printable
-% Unicode characters from the basic multilingual plane (i.e., up to
-% code point U+FFFF) can be used directly with the UTF-8 encoding.
-% Consecutive whitespace characters are combined into a single
-% space. Whitespace after a macro such as \copyright, \backslash, or
-% \sep is ignored. Blank lines are not permitted. Moreover, the
-% following markup can be used:
-%
-%   '\ '        - a literal space (for example after a macro)
-%   \%          - a literal '%'
-%   \{          - a literal '{'
-%   \}          - a literal '}'
-%   \backslash  - a literal '\'
-%   \copyright  - the (c) copyright symbol
-%
-% The macro \sep is only permitted within \Author, \Keywords, and
-% \Org. It is used to separate multiple authors, keywords, etc.
-%
-% List of supported metadata fields
-% =================================
-%
-% Here is a complete list of user-definable metadata fields currently
-% supported, and their meanings. More may be added in the future.
-%
-% General information:
-%
-%   \Author       - the document's human author. Separate multiple
-%                   authors with \sep.
-%   \Title        - the document's title.
-%   \Keywords     - list of keywords, separated with \sep.
-%   \Subject      - the abstract.
-%   \Org          - publishers.
-%
-% Copyright information:
-%
-%   \Copyright    - a copyright statement.
-%   \CopyrightURL - location of a web page describing the owner
-%                   and/or rights statement for this document.
-%   \Copyrighted  - 'True' if the document is copyrighted, and
-%                   'False' if it isn't.
-%                   This is automatically set
-%                   to 'True' if either \Copyright or \CopyrightURL
-%                   is specified, but can be overridden. For
-%                   example, if the copyright statement is "Public
-%                   Domain", this should be set to 'False'.
-%
-% Publication information:
-%
-%   \PublicationType  - The type of publication. If defined, must be
-%                   one of book, catalog, feed, journal, magazine,
-%                   manual, newsletter, pamphlet. This is
-%                   automatically set to "journal" if \Journaltitle
-%                   is specified, but can be overridden.
-%   \Journaltitle - The title of the journal in which the document
-%                   was published.
-%   \Journalnumber - The ISSN for the publication in which the
-%                   document was published.
-%   \Volume       - Journal volume.
-%   \Issue        - Journal issue/number.
-%   \Firstpage    - First page number of the published version of
-%                   the document.
-%   \Lastpage     - Last page number of the published version of
-%                   the document.
-%   \Doi          - Digital Object Identifier (DOI) for the
-%                   document, without the leading "doi:".
-%   \CoverDisplayDate - Date on the cover of the journal issue, as a
-%                   human-readable text string.
-%   \CoverDate    - Date on the cover of the journal issue, in a
-%                   format suitable for storing in a database field
-%                   with a 'date' data type.
-
-
-
-\Title{Motion R-CNN: Instance-level 3D Motion Estimation with Region-based CNNs}
-
-\Author{Simon Meister}
-
-\Copyright{Copyright \copyright\ 2017 "Simon Meister"}
-
-\Keywords{optical flow\sep
-          instance segmentation\sep
-          deep learning}
-
-\Subject{With the advent of deep learning, it has become popular to re-purpose generic deep networks for classical
-computer vision problems involving pixel-wise estimation.
-Following this trend, many recent end-to-end deep learning approaches to optical flow and scene flow
-predict complete, high resolution flow fields with a generic network for dense, pixel-wise prediction,
-thereby ignoring the inherent structure of the underlying motion estimation problem and any physical
-constraints within the scene.
-We introduce a scalable end-to-end deep learning approach for dense motion estimation that respects the
-structure of the scene as being composed of distinct objects, thus combining the representation learning
-benefits and speed of end-to-end deep networks with a physically plausible scene model inspired by
-slanted plane energy-minimization approaches to scene flow.
-Building on recent advances in region-based convolutional networks (R-CNNs), we integrate motion
-estimation with instance segmentation. Given two consecutive frames from a monocular RGB-D camera,
-our resulting end-to-end deep network detects objects with precise per-pixel object masks and estimates
-the 3D motion of each detected object between the frames. By additionally estimating a global camera
-motion in the same network, we compose a dense optical flow field based on instance-level and global
-motion predictions. We train our network on the synthetic Virtual KITTI dataset, which provides ground
-truth for all components of our system.}
-
-\setRGBcolorprofile{sRGB_IEC61966-2-1_black_scaled.icc}
-{sRGB_IEC61966-2-1_black_scaled}
-{sRGB IEC61966 v2.1 with black scaling}
-{http://www.color.org}
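A closing note on the subsection renamed to ``Dense flow from 3D motion'' in approach.tex: the hunk there ends just after the 3D point cloud is introduced, so the composition itself lies outside this diff. A plausible sketch consistent with the surrounding text — the intrinsics $K$, the pivots $p_k$, and the order of object and camera transforms are assumptions here, not the thesis's definition. A pixel $x$ with depth $d_t(x)$ is backprojected as

\begin{equation}
X = d_t(x) \, K^{-1} \tilde{x},
\end{equation}

moved by the rigid motion $(R_k, t_k)$ of the instance whose mask contains $x$ (identity for background pixels), followed by the estimated camera motion,

\begin{equation}
X' = R_{cam} \left( R_k (X - p_k) + p_k + t_k \right) + t_{cam},
\end{equation}

and the optical flow at $x$ follows by re-projection,

\begin{equation}
w(x) = \pi(K X') - x,
\end{equation}

where $\tilde{x}$ denotes $x$ in homogeneous coordinates and $\pi$ the perspective division.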