mirror of https://github.com/tu-darmstadt-informatik/bsc-thesis.git (synced 2025-12-12 17:35:51 +00:00)

commit b895fa9f18 (parent 7c9344a913): complete references
2 .gitignore (vendored)
@@ -6,8 +6,6 @@ thesis.brf
thesis.pdf
thesis.synctex.gz
thesis.log
external.bib
papers.bib
short.bib
thesis.dvi
q.log

@@ -28,8 +28,8 @@ we integrate motion estimation with instance segmentation.
Given two consecutive frames from a monocular RGB-D camera,
our resulting end-to-end deep network detects objects with precise per-pixel
object masks and estimates the 3D motion of each detected object between the frames.
By additionally estimating the camera ego-motion in the same network,
we compose a dense optical flow field based on instance-level and global motion
Additionally, we estimate the camera ego-motion in the same network,
and compose a dense optical flow field based on instance-level and global motion
predictions. We train our network on the synthetic Virtual KITTI dataset,
which provides ground truth for all components of our system.

@@ -62,8 +62,8 @@ Networks (R-CNNs) auf und integrieren Bewegungsschätzung mit Instanzsegmentieru
Bei Eingabe von zwei aufeinanderfolgenden Frames aus einer monokularen RGB-D
Kamera erkennt unser end-to-end Deep Network Objekte mit pixelgenauen Objektmasken
und schätzt die 3D-Bewegung jedes erkannten Objekts zwischen den Frames ab.
Indem wir zusätzlich im selben Netzwerk die Eigenbewerung der Kamera schätzen,
setzen wir aus den instanzbasierten und globalen Bewegungsschätzungen ein dichtes
Zusätzlich schätzen wir im selben Netzwerk die Eigenbewegung der Kamera,
und setzen aus den instanzbasierten und globalen Bewegungsschätzungen ein dichtes
optisches Flussfeld zusammen.
Wir trainieren unser Netzwerk auf dem synthetischen Virtual KITTI Datensatz,
der Ground Truth für alle Komponenten unseres Systems bereitstellt.

@@ -255,7 +255,7 @@ performs better in our case than the standard $\ell_1$-loss.
We thus compute the RoI motion loss as

\begin{equation}
L_{motion} = \frac{1}{N_{RoI}^{fg}} \sum_k^{N_{RoI}} l_{p}^k + (l_{R}^k + l_{t}^k) \cdot o_k^* + l_o^k,
\text{L}_{motion} = \frac{1}{\text{N}_{RoI}^{\mathit{fg}}} \sum_k^{\text{N}_{RoI}} l_{p}^k + (l_{R}^k + l_{t}^k) \cdot o_k^* + l_o^k,
\end{equation}
where
\begin{equation}
@@ -284,7 +284,7 @@ other than $c_k^*$ are not penalized.

Now, our modified RoI loss is
\begin{equation}
L_{RoI} = L_{cls} + L_{box} + L_{mask} + L_{motion}.
\text{L}_{RoI} = \text{L}_{cls} + \text{L}_{box} + \text{L}_{mask} + \text{L}_{motion}.
\end{equation}

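To make the combination above concrete, here is a minimal NumPy sketch of how the per-RoI terms could be summed into $\text{L}_{motion}$; it assumes the individual losses $l_p^k$, $l_R^k$, $l_t^k$, $l_o^k$ have already been computed per RoI, and the guard against zero foreground RoIs is an implementation assumption, not something taken from the thesis.

```python
import numpy as np

def roi_motion_loss(l_p, l_R, l_t, l_o, o_star, n_fg):
    """Combine the per-RoI motion loss terms as in the L_motion equation.

    l_p, l_R, l_t, l_o : arrays of shape (N_RoI,) with the per-RoI pivot,
        rotation, translation and moving/non-moving losses (assumed given).
    o_star : array of shape (N_RoI,), ground truth moving flag o_k^* in {0, 1}.
    n_fg   : number of foreground RoIs, N_RoI^fg.
    """
    per_roi = l_p + (l_R + l_t) * o_star + l_o   # rotation/translation only penalized for moving objects
    return per_roi.sum() / max(n_fg, 1)          # normalize by the foreground count
```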
\paragraph{Camera motion supervision}
@@ -339,7 +339,7 @@ full image resolution, as
the depth crops and 2D point grid are at the same resolution as the predicted
$m \times m$ mask.

For each RoI, we can now compute $L_{RoI}$ and thus supervise the object motion
For each RoI, we can now compute $\text{L}_{RoI}$ and thus supervise the object motion
by penalizing the $m \times m$ optical flow grid.
If there is optical flow ground truth available, we can use the RoI bounding box to
crop and resize a region from the ground truth optical flow to match the RoI's

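A small NumPy sketch of the crop-and-resize step described above; the (y1, x1, y2, x2) box format and the use of nearest-neighbour sampling for the flow targets (the thesis only states nearest-neighbour resizing for mask targets) are assumptions of this sketch.

```python
import numpy as np

def crop_flow_to_roi(flow, box, m):
    """Crop the ground truth flow to an RoI box and resize it to the m x m grid.

    flow : (H, W, 2) dense ground truth optical flow.
    box  : (y1, x1, y2, x2) RoI bounding box in pixel coordinates (assumed format).
    m    : side length of the RoI's prediction grid.
    """
    y1, x1, y2, x2 = box
    # Nearest-neighbour sampling of an m x m grid of positions inside the box.
    ys = np.clip(np.round(np.linspace(y1, y2, m)).astype(int), 0, flow.shape[0] - 1)
    xs = np.clip(np.round(np.linspace(x1, x2, m)).astype(int), 0, flow.shape[1] - 1)
    return flow[np.ix_(ys, xs)]                  # (m, m, 2) flow targets for this RoI
```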
@@ -105,7 +105,7 @@ image brightness differences penalizes the predictions.
& Conv-Deconv & H $\times$ W $\times$ 32 \\
masks & 1 $\times$1 conv, N$_{motions}$ & H $\times$ W $\times$ N$_{motions}$ \\
FC & From bottleneck: $\begin{bmatrix}\textrm{fully connected}, 512\end{bmatrix}$ $\times$ 2 & 1 $\times$ 512 \\
object motions & fully connected, $N_{motions} \cdot$ 9 & H $\times$ W $\times$ $N_{motions} \cdot$ 9 \\
object motions & fully connected, $\text{N}_{motions} \cdot$ 9 & H $\times$ W $\times$ $\text{N}_{motions} \cdot$ 9 \\
camera motion & From FC: $\times$ 2 & H $\times$ W $\times$ 6 \\
\midrule
\multicolumn{3}{c}{\textbf{Structure Network}}\\
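A shape-level Python sketch of the fully-connected motion head summarized by the table rows above (two 512-unit FC layers from the bottleneck, then $\text{N}_{motions} \cdot 9$ object motion outputs and 6 camera motion outputs); the ReLU activation, the weight layout, and the meaning of the 9 numbers per motion are not specified by the table and are assumptions here.

```python
import numpy as np

def motion_head(bottleneck_feat, weights, n_motions):
    """Shape-level sketch of the fully-connected motion head from the table rows above.

    bottleneck_feat : flattened bottleneck feature vector.
    weights : dict of learned weight matrices, e.g. {"fc1", "fc2", "obj", "cam"}.
    Returns (n_motions, 9) object motion parameters and a 6-vector camera motion.
    """
    x = np.maximum(bottleneck_feat @ weights["fc1"], 0.0)   # fully connected, 512 (ReLU assumed)
    x = np.maximum(x @ weights["fc2"], 0.0)                 # fully connected, 512
    object_motions = (x @ weights["obj"]).reshape(n_motions, 9)  # N_motions * 9 outputs
    camera_motion = x @ weights["cam"]                      # 6 camera motion parameters
    return object_motions, camera_motion
```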
@@ -276,10 +276,10 @@ C$_4$ & ResNet \{up to C$_4$\} (Table \ref{table:resnet}) & $\tfrac{1}{16}$ H $
\multicolumn{3}{c}{\textbf{Region Proposal Network (RPN)}}\\
\midrule
R$_0$ & From C$_4$: 1 $\times$ 1 conv, 512 & $\tfrac{1}{16}$ H $\times$ $\tfrac{1}{16}$ W $\times$ 512 \\
& 1 $\times$ 1 conv, 4 & $\tfrac{1}{16}$ H $\times$ $\tfrac{1}{16}$ W $\times$ $N_a \cdot$ 4 \\
& 1 $\times$ 1 conv, 4 & $\tfrac{1}{16}$ H $\times$ $\tfrac{1}{16}$ W $\times$ $\text{N}_a \cdot$ 4 \\
& flatten & A $\times$ 4 \\
boxes$_{\mathrm{RPN}}$ & decode bounding boxes (Eq. \ref{eq:pred_bounding_box}) & A $\times$ 4\\
& From R$_0$: 1 $\times$ 1 conv, 2 & $\tfrac{1}{16}$ H $\times$ $\tfrac{1}{16}$ W $\times$ $N_a \cdot$ 2 \\
& From R$_0$: 1 $\times$ 1 conv, 2 & $\tfrac{1}{16}$ H $\times$ $\tfrac{1}{16}$ W $\times$ $\text{N}_a \cdot$ 2 \\
& flatten & A $\times$ 2 \\
scores$_{\mathrm{RPN}}$& softmax & A $\times$ 2 \\
ROI$_{\mathrm{RPN}}$ & sample boxes$_{\mathrm{RPN}}$ and scores$_{\mathrm{RPN}}$ & N$_{RoI}$ $\times$ 6 \\
@@ -326,9 +326,9 @@ which is a deep feature encoder CNN with the original image as input.
Next, the \emph{backbone} output features are passed into a small, fully-convolutional \emph{Region Proposal Network (RPN)} head, which
predicts objectness scores and regresses bounding boxes at each of its output positions.
At any of the $h \times w$ output positions of the RPN head,
$N_a$ bounding boxes with their objectness scores are predicted as offsets relative to a fixed set of $N_a$ \emph{anchors} with different
aspect ratios and scales. Thus, there are $N_a \times h \times w$ reference anchors in total.
In Faster R-CNN, $N_a = 9$, with 3 scales, corresponding
$\text{N}_a$ bounding boxes with their objectness scores are predicted as offsets relative to a fixed set of $\text{N}_a$ \emph{anchors} with different
aspect ratios and scales. Thus, there are $\text{N}_a \times h \times w$ reference anchors in total.
In Faster R-CNN, $\text{N}_a = 9$, with 3 scales, corresponding
to anchor boxes of areas of $\{128^2, 256^2, 512^2\}$ pixels and 3 aspect ratios,
$\{1:2, 1:1, 2:1\}$. For the ResNet Faster R-CNN backbone, we generally have a stride of 16
with respect to the input image at the RPN output (Table \ref{table:maskrcnn_resnet}).
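For reference, a short NumPy sketch of how the $\text{N}_a = 9$ reference anchors per output position can be generated from the 3 areas and 3 aspect ratios given above and tiled over the $h \times w$ grid with stride 16; the (cx, cy, w, h) box representation is a choice made for this sketch.

```python
import numpy as np

def base_anchors(areas=(128**2, 256**2, 512**2), ratios=(0.5, 1.0, 2.0)):
    """The N_a = 9 base anchors (w, h): 3 areas x 3 aspect ratios (r = h / w)."""
    anchors = []
    for area in areas:
        for r in ratios:
            w = np.sqrt(area / r)
            anchors.append((w, w * r))
    return np.array(anchors)                               # shape (9, 2)

def tile_anchors(h, w, stride=16):
    """All N_a * h * w reference anchors as (cx, cy, w, h), one set per RPN output position."""
    base = base_anchors()
    cx, cy = np.meshgrid((np.arange(w) + 0.5) * stride,
                         (np.arange(h) + 0.5) * stride)
    centers = np.stack([cx.ravel(), cy.ravel()], axis=1)   # (h*w, 2) anchor centers
    centers = np.repeat(centers, len(base), axis=0)        # (h*w*9, 2)
    sizes = np.tile(base, (h * w, 1))                      # (h*w*9, 2)
    return np.concatenate([centers, sizes], axis=1)        # (h*w*9, 4)
```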
@@ -388,7 +388,7 @@ P$_6$ & From P$_5$: 2 $\times$ 2 subsample, 256 & $\tfrac{1}{64}$ H $\times$ $\t
\midrule
\multicolumn{3}{c}{$\forall i \in \{2...6\}$}\\
& From P$_i$: 1 $\times$ 1 conv, 512 & $\tfrac{1}{2^i}$ H $\times$ $\tfrac{1}{2^i}$ W $\times$ 512 \\
& 1 $\times$ 1 conv, 6 & $\tfrac{1}{2^i}$ H $\times$ $\tfrac{1}{2^i}$ W $\times$ $N_a \cdot$ 6 \\
& 1 $\times$ 1 conv, 6 & $\tfrac{1}{2^i}$ H $\times$ $\tfrac{1}{2^i}$ W $\times$ $\text{N}_a \cdot$ 6 \\
RPN$_i$& flatten & A$_i$ $\times$ 6 \\
\midrule
& From \{RPN$_2$ ... RPN$_6$\}: concatenate & A $\times$ 6 \\
@@ -440,7 +440,7 @@ from the bottleneck.
Instead of a single RPN head with anchors at 3 scales and 3 aspect ratios,
the FPN variant has one RPN head after each of the pyramid levels P$_2$ ... P$_6$ (see Table \ref{table:maskrcnn_resnet_fpn}).
At each output position of the resulting RPN pyramid, bounding boxes are predicted
with respect to 3 anchor aspect ratios $\{1:2, 1:1, 2:1\}$ and a single scale ($N_a = 3$).
with respect to 3 anchor aspect ratios $\{1:2, 1:1, 2:1\}$ and a single scale ($\text{N}_a = 3$).
For P$_2$, P$_3$, P$_4$, P$_5$, P$_6$,
the scale corresponds to anchor bounding boxes of areas $32^2, 64^2, 128^2, 256^2, 512^2$,
respectively.
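The per-level anchor configuration described above, as a small sketch; only the areas, aspect ratios, and per-level strides come from the text, the rest is illustrative.

```python
import numpy as np

# One anchor area per pyramid level P_2 ... P_6 and 3 aspect ratios (h : w),
# so N_a = 3 anchors per output position of each level's RPN head.
FPN_ANCHOR_AREAS = {2: 32**2, 3: 64**2, 4: 128**2, 5: 256**2, 6: 512**2}
ASPECT_RATIOS = (0.5, 1.0, 2.0)

def fpn_level_anchors(level):
    """(w, h) of the 3 base anchors used at pyramid level P_level.
    Tiling over the level's output grid works as in the single-level sketch above,
    with stride 2^level (the level's subsampling factor)."""
    area = FPN_ANCHOR_AREAS[level]
    return np.array([(np.sqrt(area / r), np.sqrt(area / r) * r) for r in ASPECT_RATIOS])
```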
@@ -501,14 +501,21 @@ frequently in the following chapters. For vector or tuple arguments, the sum of
losses is computed.
For classification with mutually exclusive classes, we define the categorical (softmax) cross-entropy loss,
\begin{equation}
\ell_{cls}(c, c^*) = -\log(),
\ell_{cls}(c, c^*) = -\log(c_{c^*}),
\end{equation}
where $c^* \in \{0,N_{cls}\}$ is a label (or vector of labels) and $c \in (0,1)$ is the output of a softmax layer. % TODO label has wrong range
where $c^* \in \{0,\text{C}\}$ is a ground truth label,
$c$ is the output vector from a softmax layer,
$c_{c^*} \in (0,1)$ is the output probability for class $c^*$,
and $\text{C}$ is the number of classes.
Note that for the object category classifier, $\text{C} = \text{N}_{cls} + 1$,
as $\text{N}_{cls}$ does not include the background class.
Finally, for multi-label classification, we define the binary (sigmoid) cross-entropy loss,
\begin{equation}
\ell_{cls*}(y, y^*) = -y^* \cdot \log(y) - (1 - y^*) \cdot \log(1 - y),
\end{equation}
where $y^* \in \{0,1\}$ is a label (or vector of labels) and $y \in (0,1)$ is the output of a sigmoid layer.
where $y^* \in \{0,1\}$ is a label and $y \in (0,1)$ is the output from a sigmoid layer.
Note that for the mask loss that will be introduced below, $\ell_{cls*}$ is
the sum of the $\ell_{cls*}$-losses for all 2D positions in the mask.

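The two loss definitions above translate directly into NumPy; this sketch follows the formulas as written (the categorical loss picks out the softmax probability of the ground truth class, and the mask variant sums the element-wise binary losses over all positions).

```python
import numpy as np

def softmax_cross_entropy(c, c_star):
    """ell_cls(c, c*) = -log(c_{c*}): c is the softmax output vector over the C classes,
    c_star the integer ground truth label."""
    return -np.log(c[c_star])

def sigmoid_cross_entropy(y, y_star):
    """ell_cls*(y, y*) = -y* log(y) - (1 - y*) log(1 - y), applied element-wise.
    For the mask loss, y and y* are m x m arrays and the element-wise losses
    are summed over all positions."""
    y = np.asarray(y, dtype=float)
    y_star = np.asarray(y_star, dtype=float)
    return np.sum(-y_star * np.log(y) - (1.0 - y_star) * np.log(1.0 - y))
```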
\label{ssec:rcnn_techn}
\paragraph{Bounding box regression}
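The paragraph above covers bounding box regression; as a reference sketch, this is the standard Faster R-CNN box parameterization, which I assume is what Eq. \ref{eq:pred_bounding_box} encodes and decodes, with boxes written as (cx, cy, w, h).

```python
import numpy as np

def encode_box(box, anchor):
    """Encode a box relative to an anchor, both given as (cx, cy, w, h)."""
    cx, cy, w, h = box
    acx, acy, aw, ah = anchor
    return np.array([(cx - acx) / aw, (cy - acy) / ah,
                     np.log(w / aw), np.log(h / ah)])

def decode_box(encoding, anchor):
    """Inverse of encode_box: turn predicted offsets and log-scales back into a box."""
    tx, ty, tw, th = encoding
    acx, acy, aw, ah = anchor
    return np.array([acx + tx * aw, acy + ty * ah,
                     aw * np.exp(tw), ah * np.exp(th)])
```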
@@ -575,7 +582,7 @@ the predicted relative offsets and scales encoded in $b_e$.

\paragraph{Supervision of the RPN}
A positive RPN proposal is defined as one with an IoU of at least $0.7$ with
a ground truth bounding box. For training the RPN, $N_{RPN} = 256$ positive and negative
a ground truth bounding box. For training the RPN, $\text{N}_{RPN} = 256$ positive and negative
examples are randomly sampled from the set of all RPN proposals,
with at most $50\%$ positive examples (if there are fewer positive examples,
more negative examples are used instead).
@@ -587,18 +594,18 @@ it is negative, let $s_i$ be the predicted objectness score and $b_i$, $b_i^*$ t
predicted and ground truth bounding box encodings.
Then, the RPN loss is computed as
\begin{equation}
L_{RPN} = L_{obj} + L_{box}^{RPN},
\text{L}_{RPN} = \text{L}_{obj} + \text{L}_{box}^{RPN},
\end{equation}
where
\begin{equation}
L_{obj} = \frac{1}{N_{RPN}} \sum_{i=1}^{N_{RPN}} \ell_{cls}(s_i, s_i^*),
\text{L}_{obj} = \frac{1}{\text{N}_{RPN}} \sum_{i=1}^{\text{N}_{RPN}} \ell_{cls}(s_i, s_i^*),
\end{equation}
\begin{equation}
L_{box}^{RPN} = \frac{1}{N_{RPN}^{pos}} \sum_{i=1}^{N_{RPN}} s_i^* \cdot \ell_{reg}(b_i^* - b_i),
\text{L}_{box}^{RPN} = \frac{1}{\text{N}_{RPN}^{pos}} \sum_{i=1}^{\text{N}_{RPN}} s_i^* \cdot \ell_{reg}(b_i^* - b_i),
\end{equation}
and
\begin{equation}
N_{RPN}^{pos} = \sum_{i=1}^{N_{RPN}} s_i^*
\text{N}_{RPN}^{pos} = \sum_{i=1}^{\text{N}_{RPN}} s_i^*
\end{equation}
is the number of positive examples. Note that the bounding box loss is only
active for positive examples, and that the classification loss is computed
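A NumPy sketch assembling $\text{L}_{RPN}$ from the three equations above; the explicit smooth-$\ell_1$ form of $\ell_{reg}$ is the standard Fast R-CNN one and is an assumption here, since its definition is not shown in this excerpt.

```python
import numpy as np

def smooth_l1(x):
    """Assumed form of ell_reg: the standard smooth-l1, applied element-wise and summed."""
    x = np.abs(np.asarray(x, dtype=float))
    return np.sum(np.where(x < 1.0, 0.5 * x ** 2, x - 0.5))

def rpn_loss(scores, s_star, b_pred, b_star):
    """L_RPN = L_obj + L_box^RPN over the N_RPN sampled proposals.

    scores : (N_RPN, 2) softmax objectness scores.
    s_star : (N_RPN,) integer labels, 1 for positive and 0 for negative proposals.
    b_pred, b_star : (N_RPN, 4) predicted and ground truth box encodings.
    """
    n_rpn = len(s_star)
    l_obj = np.mean([-np.log(scores[i, s_star[i]]) for i in range(n_rpn)])
    n_pos = max(int(np.sum(s_star)), 1)                     # N_RPN^pos, guarded against zero
    l_box = sum(s_star[i] * smooth_l1(b_star[i] - b_pred[i]) for i in range(n_rpn)) / n_pos
    return l_obj + l_box
```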
@@ -612,7 +619,7 @@ one with a maximum IoU in $[0.1, 0.5)$.
A total of 64 (without FPN) or 512 (with FPN) RoIs are sampled, with
at most $25\%$ foreground examples.
Now, let $c_i^*$ be the ground truth object class, where $c_i^* = 0$
for background examples and $c_i^* \in \{1, ..., N_{cls}\}$ for foreground examples,
for background examples and $c_i^* \in \{1, ..., \text{N}_{cls}\}$ for foreground examples,
and let $c_i$ be the class prediction.
Then, for any foreground RoI, let $b_i^*$ be the ground truth bounding box encoding and $b_i$
the predicted refined box encoding for class $c_i^*$.
@@ -623,23 +630,23 @@ In our implementation, we use nearest neighbour resizing for resizing the mask
targets.
Then, the RoI loss is computed as
\begin{equation}
L_{RoI} = L_{cls} + L_{box} + L_{mask}
\text{L}_{RoI} = \text{L}_{cls} + \text{L}_{box} + \text{L}_{mask}
\end{equation}
where
\begin{equation}
L_{cls} = \frac{1}{N_{RoI}} \sum_{i=1}^{N_{RoI}} \ell_{cls}(c_i, c_i^*),
\text{L}_{cls} = \frac{1}{\text{N}_{RoI}} \sum_{i=1}^{\text{N}_{RoI}} \ell_{cls}(c_i, c_i^*),
\end{equation}
is the average cross-entropy classification loss,
\begin{equation}
L_{box} = \frac{1}{N_{RoI}^{fg}} \sum_{i=1}^{N_{RoI}} [c_i^* \geq 1] \cdot \ell_{reg}(b_i^* - b_i)
\text{L}_{box} = \frac{1}{\text{N}_{RoI}^{\mathit{fg}}} \sum_{i=1}^{\text{N}_{RoI}} [c_i^* \geq 1] \cdot \ell_{reg}(b_i^* - b_i)
\end{equation}
is the average smooth-$\ell_1$ bounding box regression loss,
\begin{equation}
L_{mask} = \frac{1}{N_{RoI}^{fg}} \sum_{i=1}^{N_{RoI}} [c_i^* \geq 1] \cdot \ell_{cls*}(m_i,m_i^*)
\text{L}_{mask} = \frac{1}{\text{N}_{RoI}^{\mathit{fg}}} \sum_{i=1}^{\text{N}_{RoI}} [c_i^* \geq 1] \cdot \ell_{cls*}(m_i,m_i^*)
\end{equation}
is the average binary cross-entropy mask loss,
\begin{equation}
N_{RoI}^{fg} = \sum_{i=1}^{N_{RoI}} [c_i^* \geq 1]
\text{N}_{RoI}^{\mathit{fg}} = \sum_{i=1}^{\text{N}_{RoI}} [c_i^* \geq 1]
\end{equation}
is the number of foreground examples, and
\begin{equation}

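The corresponding sketch for the RoI head loss above, combining the classification, foreground-only box regression, and foreground-only mask terms; the array shapes and the inlined smooth-$\ell_1$ form of $\ell_{reg}$ are assumptions of this sketch.

```python
import numpy as np

def roi_loss(cls_probs, c_star, b_pred, b_star, m_pred, m_star):
    """L_RoI = L_cls + L_box + L_mask over the sampled RoIs.

    cls_probs : (N_RoI, C) softmax class probabilities.
    c_star    : (N_RoI,) integer ground truth classes, 0 = background.
    b_pred, b_star : (N_RoI, 4) box encodings for the ground truth class.
    m_pred, m_star : (N_RoI, m, m) predicted mask probabilities and binary targets.
    """
    smooth_l1 = lambda x: np.sum(np.where(np.abs(x) < 1.0, 0.5 * x ** 2, np.abs(x) - 0.5))
    fg = np.asarray(c_star) >= 1                            # the [c_i^* >= 1] indicator
    n_roi, n_fg = len(c_star), max(int(fg.sum()), 1)
    l_cls = np.mean([-np.log(cls_probs[i, c_star[i]]) for i in range(n_roi)])
    l_box = sum(smooth_l1(b_star[i] - b_pred[i]) for i in np.where(fg)[0]) / n_fg
    l_mask = sum(np.sum(-m_star[i] * np.log(m_pred[i])
                        - (1 - m_star[i]) * np.log(1 - m_pred[i]))
                 for i in np.where(fg)[0]) / n_fg
    return l_cls + l_box + l_mask
```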
155 bib.bib
@@ -4,15 +4,19 @@
Vladimir Golkov and Patrick v.d. Smagt and Daniel Cremers and Thomas Brox},
title = {{FlowNet}: Learning Optical Flow with Convolutional Networks},
booktitle = iccv-2015,
Pages = {2758--2766},
year = iccv-2015-yr,
address = iccv-2015-adr,
month = iccv-2015-mon}

@inproceedings{FlowNet2,
author = {Eddy Ilg and Nikolaus Mayer and Tonmoy Saikia and
Margret Keuper and Alexey Dosovitskiy and Thomas Brox},
title = {{FlowNet} 2.0: {E}volution of Optical Flow Estimation with Deep Networks},
Pages = {1647--1655},
booktitle = cvpr-2017,
year = cvpr-2017-yr,
address = cvpr-2017-adr,
month = cvpr-2017-mon}

@inproceedings{SceneFlowDataset,
@@ -21,7 +25,9 @@
title = {A Large Dataset to Train Convolutional Networks for
Disparity, Optical Flow, and Scene Flow Estimation},
booktitle = cvpr-2016,
Pages = {4040--4048},
year = cvpr-2016-yr,
address = cvpr-2016-adr,
month = cvpr-2016-mon}

@article{SfmNet,
@@ -37,9 +43,10 @@
@inproceedings{MaskRCNN,
Author = {Kaiming He and Georgia Gkioxari and
Piotr Doll\'{a}r and Ross Girshick},
Title = {{Mask {R-CNN}}},
Title = {Mask {R-CNN}},
Booktitle = cvpr-2017,
Year = cvpr-2017-yr,
address = cvpr-2017-adr,
month = cvpr-2017-mon}

@inproceedings{FasterRCNN,
@@ -47,14 +54,20 @@
Ross Girshick and Jian Sun},
Title = {Faster {R-CNN}: Towards Real-Time Object Detection
with Region Proposal Networks},
Booktitle = nips-2015,
Year = nips-2015-yr}
booktitle = nips-2015,
year = nips-2015-yr,
Editor = nips-2015-eds,
pages = {91--99},
Pages = {2017--2025},
Volume = nips-2015-vol}

@inproceedings{FastRCNN,
Author = {Ross Girshick},
Title = {Fast {R-CNN}},
Booktitle = iccv-2015,
pages = {1440--1448},
year = iccv-2015-yr,
address = iccv-2015-adr,
month = iccv-2015-mon}

@inproceedings{InstanceSceneFlow,
@@ -65,6 +78,7 @@
in Autonomous Driving Scenarios?},
Booktitle = iccv-2017,
year = iccv-2017-yr,
address = iccv-2017-adr,
month = iccv-2017-mon}

@inproceedings{RCNN,
@@ -76,12 +90,17 @@
object detection and semantic segmentation},
Booktitle = cvpr-2014,
Year = cvpr-2014-yr,
address = cvpr-2014-adr,
pages = {580--587},
month = cvpr-2014-mon}

@inproceedings{ImageNetCNN,
title = {ImageNet Classification with Deep Convolutional Neural Networks},
author = {Alex Krizhevsky and Sutskever, Ilya and Hinton, Geoffrey E.},
booktitle = nips-2012,
Editor = nips-2012-eds,
Volume = nips-2012-vol,
pages = {1097--1105},
year = nips-2012-yr}

@inproceedings{VGGNet,
@@ -95,18 +114,24 @@
title = {Deep Residual Learning for Image Recognition},
booktitle = cvpr-2016,
year = cvpr-2016-yr,
pages = {770--778},
address = cvpr-2016-adr,
month = cvpr-2016-mon}

@inproceedings{DenseNetDenseFlow,
author = {Yi Zhu and Shawn D. Newsam},
title = {DenseNet for Dense Flow},
booktitle = icip-2017,
month = icip-2017-mon,
address = icip-2017-adr,
year = icip-2017-yr}

@inproceedings{SE3Nets,
author = {Arunkumar Byravan and Dieter Fox},
title = {{SE3-Nets}: Learning Rigid Body Motion using Deep Neural Networks},
booktitle = {Proceedings of the IEEE International Conference on Robotics and Automation},
pages = {173--180},
address = "Singapore, Singapore",
year = {2017}}

@inproceedings{FlowLayers,
@@ -114,19 +139,29 @@
title = {Optical Flow with Semantic Segmentation and Localized Layers},
booktitle = cvpr-2016,
year = cvpr-2016-yr,
pages = {3889--3898},
address = cvpr-2016-adr,
month = cvpr-2016-mon}

@inproceedings{ESI,
author = {Min Bai and Wenjie Luo and Kaustav Kundu and Raquel Urtasun},
title = {Exploiting Semantic Information and Deep Matching for Optical Flow},
booktitle = eccv-2016,
year = eccv-2016-yr}
year = eccv-2016-yr,
Series = eccv-2016-ser,
Editor = eccv-2016-eds,
Pages = {154--170},
Publisher = eccv-2016-pub,
Volume = eccv-2016-vol6,
Sortmonth = eccv-2016-srtmon}

@inproceedings{VKITTI,
author = {Adrien Gaidon and Qiao Wang and Yohann Cabon and Eleonora Vig},
title = {Virtual Worlds as Proxy for Multi-Object Tracking Analysis},
booktitle = cvpr-2016,
year = cvpr-2016-yr,
address = cvpr-2016-adr,
pages = {4340--4349},
month = cvpr-2016-mon}

@inproceedings{KITTI2012,
@@ -134,26 +169,36 @@
title = {Are we ready for Autonomous Driving? The {KITTI} Vision Benchmark Suite},
booktitle = cvpr-2012,
year = cvpr-2012-yr,
pages = {3354--3361},
address = cvpr-2012-adr,
month = cvpr-2012-mon}

@inproceedings{KITTI2015,
author = {Moritz Menze and Andreas Geiger},
title = {Object Scene Flow for Autonomous Vehicles},
booktitle = cvpr-2015,
pages = {3061--3070},
year = cvpr-2015-yr,
address = cvpr-2015-adr,
month = cvpr-2015-mon}

@inproceedings{PRSF,
author = {C. Vogel and K. Schindler and S. Roth},
title = {Piecewise Rigid Scene Flow},
booktitle = iccv-2013,
address = iccv-2013-adr,
year = iccv-2013-yr,
Pages = {1377--1384},
month = iccv-2013-mon}

@article{PRSM,
author = {C. Vogel and K. Schindler and S. Roth},
title = {3D Scene Flow with a Piecewise Rigid Scene Model},
booktitle = ijcv,
title = {{3D} Scene Flow with a Piecewise Rigid Scene Model},
journal = ijcv,
Month = oct,
Number = {1},
Pages = {1--28},
Volume = {115},
year = {2015}}

@inproceedings{MRFlow,
@@ -161,6 +206,8 @@
title = {Optical Flow in Mostly Rigid Scenes},
booktitle = cvpr-2017,
year = cvpr-2017-yr,
address = cvpr-2017-adr,
Pages = {6911--6920},
month = cvpr-2017-mon}

@inproceedings{SPyNet,
@@ -168,6 +215,8 @@
title = {Optical Flow Estimation using a Spatial Pyramid Network},
booktitle = cvpr-2017,
year = cvpr-2017-yr,
address = cvpr-2017-adr,
Pages = {2720--2729},
month = cvpr-2017-mon}

@inproceedings{FPN,
@@ -175,6 +224,8 @@
title = {Feature Pyramid Networks for Object Detection},
booktitle = cvpr-2017,
year = cvpr-2017-yr,
Pages = {936--944},
address = cvpr-2017-adr,
month = cvpr-2017-mon}

@inproceedings{PoseNet,
@@ -182,6 +233,8 @@
title = {PoseNet: A Convolutional Network for Real-Time 6-DOF Camera Relocalization},
booktitle = iccv-2015,
year = iccv-2015-yr,
Pages = {2938--2946},
address = iccv-2015-adr,
month = iccv-2015-mon}

@inproceedings{PoseNet2,
@@ -189,24 +242,36 @@
title = {Geometric loss functions for camera pose regression with deep learning},
booktitle = cvpr-2017,
year = cvpr-2017-yr,
address = cvpr-2017-adr,
Pages = {6555--6564},
month = cvpr-2017-mon}

@inproceedings{STN,
author = {M. Jadeberg and K. Zisserman and K. Kavukcuoglu},
title = {Spatial transformer networks},
booktitle = nips-2015,
year = nips-2015-yr}
year = nips-2015-yr,
Editor = nips-2015-eds,
Pages = {2017--2025},
Volume = nips-2015-vol}

@inproceedings{CensusTerm,
author = {Fridtjof Stein},
title = {Efficient Computation of Optical Flow Using the Census Transform},
booktitle = dagm-2004,
year = dagm-2004-yr}
pages={79--86},
year = dagm-2004-yr,
volume = dagm-2004-vol,
series = dagm-2004-ser,
publisher = dagm-2004-pub,
editor = dagm-2004-eds
}

@inproceedings{DeeperDepth,
author = {Iro Laina and Christian Rupprecht and Vasileios Belagiannis and Federico Tombari and Nassir Navab},
title = {Deeper Depth Prediction with Fully Convolutional Residual Networks},
booktitle = {Proceedings of the International Conference on 3D Vision},
pages={239--248},
year = {2016}}

@inproceedings{TensorFlowObjectDetection,
@@ -215,6 +280,8 @@
title = {Speed/accuracy trade-offs for modern convolutional object detectors},
booktitle = cvpr-2017,
year = cvpr-2017-yr,
address = cvpr-2017-adr,
Pages = {3296--3297},
month = cvpr-2017-mon}

@misc{TensorFlow,
@@ -224,45 +291,64 @@
author={Martín Abadi and others},
year={2015}}

@inproceedings{LSTM,
@article{LSTM,
author = {Sepp Hochreiter and Jürgen Schmidhuber},
title = {Long Short-Term Memory},
booktitle = neco,
journal = neco,
volume = {9},
number = {8},
month = nov,
pages={1735--1780},
year = {1997}}

@inproceedings{TemporalSF,
author = {Christoph Vogel and Stefan Roth and Konrad Schindler},
title = {View-Consistent 3D Scene Flow Estimation over Multiple Frames},
title = {View-Consistent {3D} Scene Flow Estimation over Multiple Frames},
booktitle = eccv-2014,
Series = eccv-2014-ser,
Editor = eccv-2014-eds,
Pages = {263--278},
Publisher = eccv-2014-pub,
Volume = eccv-2014-vol4,
Sortmonth = eccv-2014-srtmon,
year = eccv-2014-yr}

@inproceedings{Cityscapes,
author = {M. Cordts and M. Omran and S. Ramos and T. Rehfeld and
M. Enzweiler and R. Benenson and U. Franke and S. Roth and B. Schiele},
title = {The Cityscapes Dataset for Semantic Urban Scene Understanding},
title = {The {C}ityscapes Dataset for Semantic Urban Scene Understanding},
booktitle = cvpr-2016,
year = cvpr-2016-yr,
Pages = {3213--3223},
address = cvpr-2016-adr,
month = cvpr-2016-mon}

@inproceedings{SGD,
@article{SGD,
author = {Y. LeCun and B. Boser and J. S. Denker and D. Henderson
and R. E. Howard and W. Hubbard and L. D. Jackel},
title = {Backpropagation applied to handwritten zip code recognition},
booktitle = neco,
volume={1},
number={4},
pages={541-551},
journal = neco,
year = {1989}}

@inproceedings{GCNet,
author = {Alex Kendall and Hayk Martirosyan and Saumitro Dasgupta and Peter Henry
Ryan Kennedy and Abraham Bachrach and Adam Bry},
title = {End-to-End Learning of Geometry and Context for Deep Stereo Regression},
booktitle = cvpr-2017,
year = cvpr-2017-yr,
month = cvpr-2017-mon}
booktitle = iccv-2017,
year = iccv-2017-yr,
address = iccv-2017-adr,
month = iccv-2017-mon}

@inproceedings{BN,
author = {Sergey Ioffe and Christian Szegedy},
title = {Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift},
booktitle = icml-2015,
address = icml-2015-adr,
month = icml-2015-mon,
Pages = {448--456},
year = icml-2015-yr}

@inproceedings{He,
@@ -270,35 +356,58 @@
title = {Delving Deep into Rectifiers: Surpassing Human-Level Performance on ImageNet Classification},
booktitle = iccv-2015,
year = iccv-2015-yr,
Pages = {1026--1034},
address = iccv-2015-adr,
month = iccv-2015-mon}

@inproceedings{UnFlow,
author = {Simon Meister and Junhwa Hur and Stefan Roth},
title = {UnFlow: Unsupervised Learning of Optical Flow with a Bidirectional Census Loss},
booktitle = aaai-2018,
address = aaai-2018-adr,
month = aaai-2018-mon,
Note = {to appear},
year = aaai-2018-yr}

@inproceedings{UnsupDepth,
title={Unsupervised Learning of Depth and Ego-Motion from Video},
author={Ravi Garg and BG Vijay Kumar and Gustavo Carneiro and Ian Reid},
booktitle=eccv-2016,
year=eccv-2016-yr}
pages={6612-6619},
booktitle = cvpr-2017,
year = cvpr-2017-yr,
address = cvpr-2017-adr,
month = cvpr-2017-mon}

@inproceedings{UnsupPoseDepth,
title={Unsupervised CNN for single view depth estimation: Geometry to the rescue},
author={Tinghui Zhou and Matthew Brown and Noah Snavely and David G. Lowe},
booktitle=cvpr-2017,
year = cvpr-2017-yr,
month = cvpr-2017-mon}
booktitle=eccv-2016,
year=eccv-2016-yr,
Series = eccv-2016-ser,
Editor = eccv-2016-eds,
Pages = {740--756},
Publisher = eccv-2016-pub,
Volume = eccv-2016-vol8,
Sortmonth = eccv-2016-srtmon}

@inproceedings{UnsupFlownet,
title={Back to Basics: Unsupervised Learning of Optical Flow via Brightness Constancy and Motion Smoothness},
author={Jason J. Yu and Adam W. Harley and Konstantinos G. Derpanis},
booktitle={ECCV 2016 Workshops},
Pages = {3--10},
Publisher = eccv-2016-pub,
Series = eccv-2016-ser,
Sortmonth = eccv-2016-srtmon,
Volume = eccv-2016-vol3,
year={2016}}

@article{ImageNet,
title={ImageNet Large Scale Visual Recognition Challenge},
author={Olga Russakovsky and others},
booktitle=ijcv,
month={Dec},
day={01},
volume={115},
number={3},
pages={211--252},
journal=ijcv,
year={2015}}
@@ -19,6 +19,7 @@ and achieves high accuracy in classifying between moving and non-moving objects,
the accuracy of the motion predictions is still not convincing.
More work will thus be required to bring the system (closer) to competitive accuracy,
which includes trying penalization with the flow loss instead of 3D motion ground truth,
experimenting with the weighting between different loss terms,
and improvements to the network architecture and training process.
We thus presented a partial step towards real-time 3D motion estimation based on a
physically sound scene decomposition. Thanks to instance-level reasoning, in contrast

@@ -162,6 +162,8 @@ first 144K iterations and $0.25 \cdot 10^{-3}$ for all remaining iterations.
For training the RPN and RoI heads and during inference,
we use the exact same number of proposals and RoIs as Mask R-CNN in
the ResNet and ResNet-FPN variants, respectively.
All losses are added up without additional weighting between the loss terms,
as in Mask R-CNN.

\paragraph{Initialization}
For initializing the C$_1$ to C$_5$ weights, we use a pre-trained
@@ -226,7 +228,7 @@ Evaluation of different metrics on the Virtual KITTI validation set.
AEE: Average Endpoint Error; Fl-all: Ratio of pixels where flow estimate is
wrong by both $\geq 3$ pixels and $\geq 5\%$.
We compare network variants with and without FPN.
Camera and instance motion errors are averaged over the validation set.
All metrics are averaged over all examples in the validation set.
Quantities in parentheses in the first row are the average ground truth values for the estimated
quantity. For example, we compare the error in camera angle, $E_{R}^{cam} [deg]$,
to the average rotation angle in the ground truth camera motions.
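The two flow metrics named in this caption can be computed as follows; the Fl-all test follows the definition given above (wrong by both $\geq 3$ pixels and $\geq 5\%$ of the ground truth flow magnitude), applied per pixel over the whole field.

```python
import numpy as np

def flow_metrics(flow_pred, flow_gt):
    """AEE and Fl-all for a dense flow field.

    flow_pred, flow_gt : (H, W, 2) predicted and ground truth optical flow.
    A pixel counts as wrong for Fl-all if its endpoint error is both >= 3 px
    and >= 5% of the ground truth flow magnitude.
    """
    epe = np.linalg.norm(flow_pred - flow_gt, axis=-1)   # per-pixel endpoint error
    gt_mag = np.linalg.norm(flow_gt, axis=-1)
    aee = float(epe.mean())
    fl_all = float(np.mean((epe >= 3.0) & (epe >= 0.05 * gt_mag)))
    return aee, fl_all
```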
@@ -250,14 +252,17 @@ on the Virtual KITTI validation set.
\paragraph{Camera motion}
Both variants achieve a low error in predicted camera translation, relative to
the average ground truth camera translation. The camera rotation angle error
is relatively high compared to the small average ground truth camera rotation.
is still relatively high, compared to the small average ground truth camera rotation.
Although both variants use the exact same network for predicting the camera motion,
the FPN variant performs worse here, with the error in rotation angle twice as high.
One possible explanation that should be investigated in further work is
that in the FPN variant, all blocks in the backbone are shared between the camera
motion branch and the feature pyramid. In the variant without FPN, the C$_5$ and
C$_6$ blocks are only used in the camera branch, and thus only experience weight
updates due to the camera motion loss.
updates due to the camera motion loss. This could mean that the weight updates due
to the RoI head losses are detrimental to the camera motion estimation.
As a remedy, increasing the loss weighting of the camera motion loss may be
helpful.

\paragraph{Instance motion}
The object pivots are estimated with relatively (given that the scenes are in a realistic scale)
@@ -266,13 +271,16 @@ precise, which we ascribe to the higher resolution features used in this variant

The predicted 3D object translations and rotations still have a relatively high
error, compared to the average actual (ground truth) translations and rotations,
which may be due to implementation issues or problems with the current 3D motion
ground truth loss.
The FPN variant is only slightly more accurate for these predictions, which suggests
that there may still be issues with our implementation, as one would expect the
FPN to be more accurate.
which may be due to implementation issues, a non-optimal network architecture,
or problems with the current 3D motion ground truth loss
(e.g., non-optimal weighting between the components of the motion loss, or between motion and classification losses).
Note that the relative error is higher for rotations, which is
also the case in the camera motion estimates.
The FPN variant is only slightly more accurate for these predictions, which again suggests
that there may still be issues with our loss design, loss weighting, or implementation, as one would expect the
FPN to yield more accurate motion estimates (as is the case for the pivot estimation).

\paragraph{Instance segmentation}
Looking at Figure \ref{figure:vkitti}, our instance segmentation results are in
many cases still lacking the accuracy seen in the Mask R-CNN Cityscapes \cite{MaskRCNN} results,
some cases still lacking the accuracy seen in the Mask R-CNN Cityscapes \cite{MaskRCNN} results,
which is likely due to implementation details.

@@ -124,7 +124,7 @@ In the following, we will refer to systems which use deep networks for all
optimization and do not perform time-critical side computation (e.g. numerical optimization)
at inference time as \emph{end-to-end} deep learning systems.

\paragraph{Deep networks in optical flow}
\paragraph{Deep networks in optical flow estimation}

End-to-end deep networks for optical flow were recently introduced
based on encoder-decoder networks or CNN pyramids \cite{FlowNet, FlowNet2, SPyNet},

@@ -1253,7 +1253,8 @@

@String{aaai-2018 = "Proceedings of the Thirty-Second AAAI Conference on Artificial Intelligence"}
@String{aaai-2018-yr = "2018"}
@String{aaai-2018-adr = "Menlo Park, California"}
@String{aaai-2018-mon = feb}
@String{aaai-2018-adr = "New Orleans, Louisiana"}
@String{aaai-2018-pub = "AAAI Press"}

@String{aaai-2005 = "Proceedings of the Twentieth International Conference on Machine Learning"}

281 old.bib
@@ -1,281 +0,0 @@
10 thesis.tex
@@ -65,6 +65,8 @@
\DeclareFieldFormat*{title}{\mkbibemph{#1\isdot}} % zitierte Titel kursiv formatieren

\addbibresource{long_extended.bib} % Hier Pfad zu deiner .bib-Datei hineinschreiben
%\addbibresource{external.bib} % Hier Pfad zu deiner .bib-Datei hineinschreiben
%\addbibresource{papers.bib} % Hier Pfad zu deiner .bib-Datei hineinschreiben
\addbibresource{bib.bib} % Hier Pfad zu deiner .bib-Datei hineinschreiben
\nocite{*} % Alle Einträge in der .bib-Datei im Literaturverzeichnis ausgeben, auch wenn sie nicht im Text zitiert werden. Gut zum Testen der .bib-Datei, sollte aber nicht generell verwendet werden. Stattdessen lieber gezielt Einträge mit Keywords ausgeben lassen (siehe \printbibliography in Zeile 224).

@@ -94,19 +96,19 @@
\author{\myname}
\thesistitle{\mytitleen}{\mytitlede}
\birthplace{Erbach}
\date{21.11.2017}
\date{22.11.2017}
\referee{\myprof}{M.Sc. Junhwa Hur}
\department{\myinstitute}
\group{\myfaculty}
\dateofexam{23.11.2017}{23.11.2017}
\dateofexam{23.11.2017}{22.11.2017}
\makethesistitle

% Eigenständigkeitserklärung: muss nach \makethesistitle erscheinen, sonst wird sie als erste Seite des Dokuments gesetzt.
\affidavit[23.11.2017]{\myname}
\affidavit[22.11.2017]{\myname}
%\affidavit{\myname}

\pagestyle{myheadings} % Seitenstil umschalten
\mymarkright{Version: \today} % Inhalt der Fußzeile
%\mymarkright{Version: \today} % Inhalt der Fußzeile

\input{abstract}
