Unpack `.tex` source.
- WireSegHR-tex.tar.gz +3 -0
- paper-tex/cvpr.sty +645 -0
- paper-tex/egbib.bib +592 -0
- paper-tex/figure_tex/additional_visualizations.tex +8 -0
- paper-tex/figure_tex/annotation.tex +11 -0
- paper-tex/figure_tex/comparison.tex +27 -0
- paper-tex/figure_tex/failure.tex +11 -0
- paper-tex/figure_tex/inpaint_compare.tex +12 -0
- paper-tex/figure_tex/motivation.tex +11 -0
- paper-tex/figure_tex/new_failure_cases.tex +9 -0
- paper-tex/figure_tex/new_panorama.tex +8 -0
- paper-tex/figure_tex/overpredict.tex +14 -0
- paper-tex/figure_tex/panorama.tex +9 -0
- paper-tex/figure_tex/pipeline.tex +9 -0
- paper-tex/figure_tex/pixel6.tex +10 -0
- paper-tex/figure_tex/teaser.tex +16 -0
- paper-tex/figures/additional_visualizations.pdf +3 -0
- paper-tex/figures/annotation.pdf +3 -0
- paper-tex/figures/inpainting_result.pdf +3 -0
- paper-tex/figures/new_failures.pdf +3 -0
- paper-tex/figures/new_panorama.pdf +3 -0
- paper-tex/figures/panorama.pdf +3 -0
- paper-tex/figures/pixel6.pdf +3 -0
- paper-tex/figures/qualitative.pdf +3 -0
- paper-tex/figures/teaser2.pdf +3 -0
- paper-tex/figures/wire-ablation.pdf +3 -0
- paper-tex/figures/wire-pipeline.png +3 -0
- paper-tex/figures/wire-pixel6.pdf +3 -0
- paper-tex/figures/wire-qualitative.pdf +3 -0
- paper-tex/figures/wireishard.pdf +3 -0
- paper-tex/ieee_fullname.bst +1135 -0
- paper-tex/ms.bbl +337 -0
- paper-tex/ms.tex +100 -0
- paper-tex/sections/abstract.tex +23 -0
- paper-tex/sections/conclusion.tex +8 -0
- paper-tex/sections/dataset.tex +32 -0
- paper-tex/sections/discussion.tex +43 -0
- paper-tex/sections/introduction.tex +86 -0
- paper-tex/sections/method.tex +98 -0
- paper-tex/sections/method_yq.tex +111 -0
- paper-tex/sections/related_work.tex +36 -0
- paper-tex/sections/results.tex +156 -0
- paper-tex/supplement.bbl +24 -0
- paper-tex/supplement.tex +184 -0
- paper-tex/tables/component.tex +25 -0
- paper-tex/tables/inpaint.tex +30 -0
- paper-tex/tables/logit.tex +18 -0
- paper-tex/tables/results.tex +53 -0
- paper-tex/tables/stats.tex +22 -0
- paper-tex/tables/thresholds.tex +22 -0
WireSegHR-tex.tar.gz
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0a17096a2eaad07f51345426465697fcf0ee1a0c5b54aa6742b4ac23406f6bc4
+size 33690376
paper-tex/cvpr.sty
ADDED
@@ -0,0 +1,645 @@
[Standard CVPR LaTeX style file, 645 added lines: header comments on the template's history (the original CVPR/ICCV style, Ming-Ming Cheng's single-file unification, and Stefan Roth's CVPR 2022 modernization); usage notes (`\documentclass[times,10pt,twocolumn]{article}` with `\usepackage[review|rebuttal|final]{cvpr}` and a `{\small \bibliographystyle{ieee} \bibliography{...}}` block); an embedded copy of eso-pic for shipout-page overlays; the CVPR page-layout dimensions and class-option checks (10pt, twocolumn, letterpaper, hyperref with pagebackref); `\maketitle`, abstract, and section formatting; the review-mode line-number ruler, per-page paper-ID boxes, and "CONFIDENTIAL REVIEW COPY" banner; float-placement tuning; and the `\eg`/`\ie`/`\etal`/`\etc` abbreviation macros.]
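
For orientation, the header of `cvpr.sty` documents how the style is meant to be loaded. The sketch below is a minimal preamble consistent with those notes and with the files listed above (`sections/*.tex`, `egbib.bib`, `ieee_fullname.bst`); it is an assumption-laden reconstruction, not the archive's actual `ms.tex`, which may use different packages and options.

```latex
% Sketch only, following the usage notes inside cvpr.sty; the real ms.tex
% in paper-tex/ may differ in packages, options, and section order.
\documentclass[10pt,twocolumn,letterpaper]{article}
\usepackage{cvpr}   % options per the style header: review, rebuttal, final
                    % (review/rebuttal modes also expect \cvprPaperID and
                    % \confName to be defined before \begin{document})
\usepackage[pagebackref]{hyperref}  % cvpr.sty warns if this is missing
                                    % (pagebackref recommended for review)

\begin{document}
\title{Paper Title}      % placeholder
\author{Author Names}    % placeholder
\maketitle
\input{sections/abstract}
\input{sections/introduction}
% ... remaining sections/*.tex and the bibliography ...
\end{document}
```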
paper-tex/egbib.bib
ADDED
@@ -0,0 +1,592 @@
[BibTeX bibliography for the paper, 592 added lines. It opens with two blocks of `@String` venue abbreviations (PAMI, IJCV, CVPR, ICCV, ECCV, NIPS/NeurIPS, TOG, TIP, and so on; the second block redefines them to short forms), followed by the paper's entries. Keys visible in this diff excerpt: resnet, deeplabv3p, deeplabv3, deeplab, pspnet, ccnet, attention, vit, swin, dpt, setr, segformer, glnet, cascadepsp, magnet, cable_inst, pldu, ttpla, powerlinedataset, cityscapes, mapillary, coco, 3drecon, domainadapt, lsnet, syntheticwire, lanedet, swiftlane, structurelane, linetransformer, linegraph, focal, ohem, adamw, barnes2009patchmatch, inria, Pixel6as49, isdnet, suvorov2022resolution, deepglobe, learning_downsample, stdc, zheng2022cm, jain2022keys, rombach2022high, yu2019free, zhou2017places, and karras2020analyzing (the excerpt ends mid-entry).]
|
| 440 |
+
title={Analyzing and improving the image quality of stylegan},
|
| 441 |
+
author={Karras, Tero and Laine, Samuli and Aittala, Miika and Hellsten, Janne and Lehtinen, Jaakko and Aila, Timo},
|
| 442 |
+
booktitle={Proceedings of the IEEE/CVF conference on computer vision and pattern recognition},
|
| 443 |
+
pages={8110--8119},
|
| 444 |
+
year={2020}
|
| 445 |
+
}
|
| 446 |
+
|
| 447 |
+
@article{wexler2007space,
|
| 448 |
+
title={Space-time completion of video},
|
| 449 |
+
author={Wexler, Yonatan and Shechtman, Eli and Irani, Michal},
|
| 450 |
+
journal={IEEE Transactions on pattern analysis and machine intelligence},
|
| 451 |
+
volume={29},
|
| 452 |
+
number={3},
|
| 453 |
+
pages={463--476},
|
| 454 |
+
year={2007},
|
| 455 |
+
publisher={IEEE}
|
| 456 |
+
}
|
| 457 |
+
|
| 458 |
+
@article{darabi2012image,
|
| 459 |
+
title={Image melding: Combining inconsistent images using patch-based synthesis},
|
| 460 |
+
author={Darabi, Soheil and Shechtman, Eli and Barnes, Connelly and Goldman, Dan B and Sen, Pradeep},
|
| 461 |
+
journal={ACM Transactions on graphics (TOG)},
|
| 462 |
+
volume={31},
|
| 463 |
+
number={4},
|
| 464 |
+
pages={1--10},
|
| 465 |
+
year={2012},
|
| 466 |
+
publisher={ACM New York, NY, USA}
|
| 467 |
+
}
|
| 468 |
+
|
| 469 |
+
@inproceedings{kaspar2015self,
|
| 470 |
+
title={Self tuning texture optimization},
|
| 471 |
+
author={Kaspar, Alexandre and Neubert, Boris and Lischinski, Dani and Pauly, Mark and Kopf, Johannes},
|
| 472 |
+
booktitle={Computer Graphics Forum},
|
| 473 |
+
volume={34},
|
| 474 |
+
number={2},
|
| 475 |
+
pages={349--359},
|
| 476 |
+
year={2015},
|
| 477 |
+
organization={Wiley Online Library}
|
| 478 |
+
}
|
| 479 |
+
|
| 480 |
+
@inproceedings{contextencoder,
|
| 481 |
+
title={Context encoders: Feature learning by inpainting},
|
| 482 |
+
author={Pathak, Deepak and Krahenbuhl, Philipp and Donahue, Jeff and Darrell, Trevor and Efros, Alexei A},
|
| 483 |
+
booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition},
|
| 484 |
+
pages={2536--2544},
|
| 485 |
+
year={2016}
|
| 486 |
+
}
|
| 487 |
+
|
| 488 |
+
@article{globallocal,
|
| 489 |
+
title={Globally and locally consistent image completion},
|
| 490 |
+
author={Iizuka, Satoshi and Simo-Serra, Edgar and Ishikawa, Hiroshi},
|
| 491 |
+
journal={ACM Transactions on Graphics (ToG)},
|
| 492 |
+
volume={36},
|
| 493 |
+
number={4},
|
| 494 |
+
pages={1--14},
|
| 495 |
+
year={2017},
|
| 496 |
+
publisher={ACM New York, NY, USA}
|
| 497 |
+
}
|
| 498 |
+
|
| 499 |
+
@inproceedings{contextual,
|
| 500 |
+
title={Generative image inpainting with contextual attention},
|
| 501 |
+
author={Yu, Jiahui and Lin, Zhe and Yang, Jimei and Shen, Xiaohui and Lu, Xin and Huang, Thomas S},
|
| 502 |
+
booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition},
|
| 503 |
+
pages={5505--5514},
|
| 504 |
+
year={2018}
|
| 505 |
+
}
|
| 506 |
+
|
| 507 |
+
@inproceedings{partialconv,
|
| 508 |
+
title={Image inpainting for irregular holes using partial convolutions},
|
| 509 |
+
author={Liu, Guilin and Reda, Fitsum A and Shih, Kevin J and Wang, Ting-Chun and Tao, Andrew and Catanzaro, Bryan},
|
| 510 |
+
booktitle={Proceedings of the European conference on computer vision (ECCV)},
|
| 511 |
+
pages={85--100},
|
| 512 |
+
year={2018}
|
| 513 |
+
}
|
| 514 |
+
@article{comodgan,
|
| 515 |
+
title={Large scale image completion via co-modulated generative adversarial networks},
|
| 516 |
+
author={Zhao, Shengyu and Cui, Jonathan and Sheng, Yilun and Dong, Yue and Liang, Xiao and Chang, Eric I and Xu, Yan},
|
| 517 |
+
journal={arXiv preprint arXiv:2103.10428},
|
| 518 |
+
year={2021}
|
| 519 |
+
}
|
| 520 |
+
|
| 521 |
+
@article{dalle,
|
| 522 |
+
title={Hierarchical text-conditional image generation with clip latents},
|
| 523 |
+
author={Ramesh, Aditya and Dhariwal, Prafulla and Nichol, Alex and Chu, Casey and Chen, Mark},
|
| 524 |
+
journal={arXiv preprint arXiv:2204.06125},
|
| 525 |
+
year={2022}
|
| 526 |
+
}
|
| 527 |
+
|
| 528 |
+
@inproceedings{zeng2020high,
|
| 529 |
+
title={High-resolution image inpainting with iterative confidence feedback and guided upsampling},
|
| 530 |
+
author={Zeng, Yu and Lin, Zhe and Yang, Jimei and Zhang, Jianming and Shechtman, Eli and Lu, Huchuan},
|
| 531 |
+
booktitle={European conference on computer vision},
|
| 532 |
+
pages={1--17},
|
| 533 |
+
year={2020},
|
| 534 |
+
organization={Springer}
|
| 535 |
+
}
|
| 536 |
+
|
| 537 |
+
|
| 538 |
+
@inproceedings{hifill,
|
| 539 |
+
title={Contextual Residual Aggregation for Ultra High-Resolution Image Inpainting},
|
| 540 |
+
author={Yi, Zili and Tang, Qiang and Azizi, Shekoofeh and Jang, Daesik and Xu, Zhan},
|
| 541 |
+
booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
|
| 542 |
+
pages={7508--7517},
|
| 543 |
+
year={2020}
|
| 544 |
+
}
|
| 545 |
+
@inproceedings{supercaf,
|
| 546 |
+
title={Inpainting at Modern Camera Resolution by Guided PatchMatch with Auto-curation},
|
| 547 |
+
author={Zhang, Lingzhi and Barnes, Connelly and Wampler, Kevin and Amirghodsi, Sohrab and Shechtman, Eli and Lin, Zhe and Shi, Jianbo},
|
| 548 |
+
booktitle={European Conference on Computer Vision},
|
| 549 |
+
pages={51--67},
|
| 550 |
+
year={2022},
|
| 551 |
+
organization={Springer}
|
| 552 |
+
}
|
| 553 |
+
|
| 554 |
+
@article{hassani2022neighborhood,
|
| 555 |
+
title = {Neighborhood Attention Transformer},
|
| 556 |
+
author = {Ali Hassani and Steven Walton and Jiachen Li and Shen Li and Humphrey Shi},
|
| 557 |
+
year = 2022,
|
| 558 |
+
url = {https://arxiv.org/abs/2204.07143},
|
| 559 |
+
eprint = {2204.07143},
|
| 560 |
+
archiveprefix = {arXiv},
|
| 561 |
+
primaryclass = {cs.CV}
|
| 562 |
+
}
|
| 563 |
+
@article{hassani2022dilated,
|
| 564 |
+
title = {Dilated Neighborhood Attention Transformer},
|
| 565 |
+
author = {Ali Hassani and Humphrey Shi},
|
| 566 |
+
year = 2022,
|
| 567 |
+
url = {https://arxiv.org/abs/2209.15001},
|
| 568 |
+
eprint = {2209.15001},
|
| 569 |
+
archiveprefix = {arXiv},
|
| 570 |
+
primaryclass = {cs.CV}
|
| 571 |
+
}
|
| 572 |
+
|
| 573 |
+
@article{jain2021semask,
|
| 574 |
+
title={SeMask: Semantically Masking Transformer Backbones for Effective Semantic Segmentation},
|
| 575 |
+
author={Jitesh Jain and Anukriti Singh and Nikita Orlov and Zilong Huang and Jiachen Li and Steven Walton and Humphrey Shi},
|
| 576 |
+
journal={arXiv},
|
| 577 |
+
year={2021}
|
| 578 |
+
}
|
| 579 |
+
|
| 580 |
+
@inproceedings{jain2022oneformer,
|
| 581 |
+
title={{OneFormer: One Transformer to Rule Universal Image Segmentation}},
|
| 582 |
+
author={Jitesh Jain and Jiachen Li and MangTik Chiu and Ali Hassani and Nikita Orlov and Humphrey Shi},
|
| 583 |
+
journal={CVPR},
|
| 584 |
+
year={2023}
|
| 585 |
+
}
|
| 586 |
+
|
| 587 |
+
@inproceedings{xu2022image,
|
| 588 |
+
title={Image Completion with Heterogeneously Filtered Spectral Hints},
|
| 589 |
+
author={Xingqian Xu and Shant Navasardyan and Vahram Tadevosyan and Andranik Sargsyan and Yadong Mu and Humphrey Shi},
|
| 590 |
+
booktitle={WACV},
|
| 591 |
+
year={2023}
|
| 592 |
+
}
|
paper-tex/figure_tex/additional_visualizations.tex
ADDED
%auto-ignore
\begin{figure*}[t!]
\setlength{\abovecaptionskip}{1mm}
\centering
\includegraphics[width=\textwidth]{figures/additional_visualizations.pdf}
\caption{\textbf{Segmentation and inpainting visualizations}. Our model handles several challenging scenes, including strong backlighting (top row), backgrounds with complex texture (2nd row), low light (3rd row), and barely visible wires (4th row).}
\label{fig:additional_visualizations}
\end{figure*}
paper-tex/figure_tex/annotation.tex
ADDED
%auto-ignore
\begin{figure}[t!]
\centering
\captionsetup{type=figure}
\includegraphics[width=\linewidth]{figures/annotation.pdf}\\
\vspace{-1mm}
\captionof{figure}{\textbf{Wire Annotation Example.} An example wire annotation in our dataset. Our annotation (B) remains accurate across different wire thicknesses (\textcolor{red}{red}), variations in wire shape (\textcolor{orange}{orange}), and wire occlusions (\textcolor{yellow}{yellow}).}
% \textcolor{red}{This image is unchanged from last year, do we want to say this is one of the collected images rather than saying it's in the benchmark dataset?}
\vspace{-4mm}
\label{fig:annotation}
\end{figure}
paper-tex/figure_tex/comparison.tex
ADDED
%auto-ignore
\begin{figure*}[t!]
\centering
% \vspace{1mm}
\includegraphics[width=1.0\linewidth]{figures/wire-qualitative.pdf}
\vspace{-8mm}
\caption{
% Qualitative comparison between our method and baseline networks on a 32 Megapixel image. (a), (b): Whole image inference. (c), (d): Sliding window inference. (e), (f): Our two-stage model. Our model predicts much tighter segmentation masks without artifacts such as aliasing, while avoiding false positives caused by only seeing local regions. Each R/G/B bounding box in (c),(d),(e) represents a 1024$\times$1024 sliding-window used by the fine module for refinement.
\textbf{Qualitative comparison of several semantic segmentation models.} %Our model predicts much tighter and consistent segmentation masks, while avoiding false positives caused by only seeing local regions. Note that in the third row, our two-stage model successfully suppresses false positives that would otherwise be misclassified without global context.
A common object semantic segmentation model (DeepLabv3+) either misses thin wires or overpredicts due to the lack of global context. CascadePSP and MagNet, being refinement-based models, do not work well on wires when the predictions they refine are inaccurate or missing. ISDNet captures many thin wire regions but cannot produce high-quality predictions. In contrast, our model captures accurate wire regions, produces fine wire masks, and maintains low inference time.
\vspace{-5mm}
}
\label{fig:visual}
\end{figure*}


\begin{figure*}[t!]
\centering
\vspace{1mm}
\includegraphics[width=1.0\linewidth]{figures/wire-ablation.pdf}
\vspace{-7mm}
\caption{
\textbf{Qualitative comparison of our model components.} MinMax enhances wire image features when they are too subtle to see in RGB, while MaxPool encourages aggressive predictions in the coarse branch. Both components enable the model to pick up more regions for the final wire mask prediction.
}
% \vspace{-2mm}
\label{fig:visual}
\end{figure*}
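The MinMax component named in the ablation caption above amounts to giving the network extra input channels built from min- and max-filtered copies of the luminance, so that wires too subtle to see in RGB produce a stronger signal. Below is a minimal sketch of that idea; the 5-pixel filter window, the Rec. 709 luma weights, and the function name are illustrative assumptions rather than the paper's exact settings, and the MaxPool component (aggressive coarse-branch predictions) is not sketched here.

```python
# Minimal sketch of a MinMax-style input augmentation (illustrative only).
import numpy as np
from scipy.ndimage import minimum_filter, maximum_filter

def add_minmax_channels(rgb: np.ndarray, window: int = 5) -> np.ndarray:
    """rgb: float32 array in [0, 1] of shape (H, W, 3).
    Returns an (H, W, 5) array: RGB plus min- and max-filtered luminance."""
    # Rec. 709 luminance weights (assumed; any luma definition would do here).
    luma = 0.2126 * rgb[..., 0] + 0.7152 * rgb[..., 1] + 0.0722 * rgb[..., 2]
    lo = minimum_filter(luma, size=window)  # dark, thin wires stand out against bright sky
    hi = maximum_filter(luma, size=window)  # bright wires stand out against dark backgrounds
    return np.concatenate([rgb, lo[..., None], hi[..., None]], axis=-1)

# Example: a 3 MP image becomes a 5-channel input for the segmentation network.
img = np.random.rand(1500, 2000, 3).astype(np.float32)
aug = add_minmax_channels(img)
print(aug.shape)  # (1500, 2000, 5)
```

The scipy filters are used here purely for brevity; any grayscale erosion/dilation with a small structuring element would serve the same purpose.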
paper-tex/figure_tex/failure.tex
ADDED
%auto-ignore
\begin{figure}[h!]
\centering
\captionsetup{type=figure}
\includegraphics[width=1.\linewidth]{figures/failure.pdf}
\vspace{-6mm}
\captionof{figure}{\textbf{Failure cases}. Our model can fail when wires are heavily blended with the background (upper row), or on thick wires with complex, rarely seen patterns.
\vspace{-5mm}
}
\label{fig:failure}
\end{figure}
paper-tex/figure_tex/inpaint_compare.tex
ADDED
%auto-ignore

\begin{figure}[h!]
\centering
\captionsetup{type=figure}
\includegraphics[width=1.\linewidth]{figures/inpainting_result.pdf}
\vspace{-6mm}
\captionof{figure}{\textbf{Inpainting Comparison}. Our model performs well on completing complicated structures and keeping colors consistent, especially on building facades and on sky regions of plain, uniform color.
\vspace{-3mm}
}
\label{fig:wire_inp}
\end{figure}
paper-tex/figure_tex/motivation.tex
ADDED
%auto-ignore
\begin{figure}[t!]
\centering
\includegraphics[width=1.0\linewidth]{figures/wireishard.pdf}
% \includegraphics[width=1.0\linewidth]{figures/wiresarehard2.pdf}
\caption{\textbf{Challenges of wire segmentation.} Wires have a diverse set of appearances. Challenges include but are not limited to (a) structural complexity, (b) visibility and thickness, (c) partial occlusion by other objects, (d) camera aberration artifacts, and variations in (e) object attachment, (f) color, (g) width and (h) shape.
% \zwei{this needs to be correspondent to the attributes you mentioned}
}
\vspace{-5.5mm}
\label{fig:motivation}
\end{figure}
paper-tex/figure_tex/new_failure_cases.tex
ADDED
%auto-ignore
\begin{figure}[h!]
\setlength{\abovecaptionskip}{1mm}
\centering
\includegraphics[width=0.9\linewidth]{figures/new_failures.pdf}
\caption{\textbf{Failure cases}. In some challenging cases, our model fails to predict accurate masks. Zoom in to see detailed wire masks in ground truth and prediction.}
\label{fig:new_failures}
\vspace{-10mm}
\end{figure}
paper-tex/figure_tex/new_panorama.tex
ADDED
%auto-ignore
\begin{figure*}[t!]
\centering
\includegraphics[width=1.0\textwidth]{figures/new_panorama.pdf}
\caption{\textbf{Segmentation and inpainting result for a panoramic image.} Our model is scalable to very large images with very thin wires.}
\label{fig:new_panorama}
\vspace{-3mm}
\end{figure*}
paper-tex/figure_tex/overpredict.tex
ADDED
%auto-ignore
\begin{figure}[h!]
\centering
\begin{tabular}{@{}c@{\hspace{1mm}}c@{\hspace{1mm}}c@{}}
\captionsetup{type=figure}
\includegraphics[width=0.9\linewidth]{figures/overpredict/02709_global.jpg} \\
(a) Fine module output with global logit map \\
and binary location map. \\
\includegraphics[width=0.9\linewidth]{figures/overpredict/02709_local.jpg} \\
(b) Fine module output with only local logit map. \\
\end{tabular}
\captionof{figure}{(Image to be updated)}
\label{fig:overpredict}
\end{figure}
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
%auto-ignore
|
| 2 |
+
\begin{figure*}[t!]
|
| 3 |
+
\centering
|
| 4 |
+
\includegraphics[width=1.0\textwidth]{figures/panorama.pdf}
|
| 5 |
+
\vspace{-7mm}
|
| 6 |
+
\caption{\textbf{Segmentation result for a panoramic image.} Our two-stage model leverages the sparsity of wires in natural images, and efficiently generalizes to ultra-high resolution images such as this panorama of $10$K by $2$K resolution. Note that our method produces high quality wire segmentation that covers wires that are almost invisible.}
|
| 7 |
+
\vspace{-4mm}
|
| 8 |
+
\label{fig:panorama}
|
| 9 |
+
\end{figure*}
|
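The scalability claim in the caption above rests on wire sparsity: the coarse, downsampled prediction tells the fine stage which full-resolution tiles are worth refining, and everything else can be skipped. The sketch below illustrates that tile selection; the 1024-pixel tile size echoes the sliding-window size mentioned in the commented-out caption of comparison.tex, while the 0.5 threshold and the function name are assumptions made only for illustration.

```python
# Sketch: pick only the full-resolution tiles worth refining, based on the
# coarse (downsampled) wire-probability map. Threshold and tile size are
# illustrative; the real system's values may differ.
import numpy as np

def tiles_to_refine(coarse_prob, full_hw, tile=1024, thresh=0.5):
    """coarse_prob: (h, w) probability map predicted at low resolution.
    full_hw: (H, W) of the original image. Yields (y0, x0, y1, x1) boxes."""
    H, W = full_hw
    h, w = coarse_prob.shape
    for y0 in range(0, H, tile):
        for x0 in range(0, W, tile):
            y1, x1 = min(y0 + tile, H), min(x0 + tile, W)
            # Map the full-resolution tile back onto the coarse map.
            cy0, cy1 = y0 * h // H, max(y1 * h // H, y0 * h // H + 1)
            cx0, cx1 = x0 * w // W, max(x1 * w // W, x0 * w // W + 1)
            if coarse_prob[cy0:cy1, cx0:cx1].max() >= thresh:
                yield (y0, x0, y1, x1)

# Toy 10K x 2K panorama with one short, thin wire near the top-left corner.
coarse = np.zeros((256, 1280), dtype=np.float32)
coarse[40:42, :160] = 0.9
print(len(list(tiles_to_refine(coarse, (2000, 10000)))))  # 2 of the 20 tiles
```

Only the selected tiles go through the fine module, and their refined logits are pasted back into the full-resolution mask, which is what keeps panorama-scale inference cheap.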
paper-tex/figure_tex/pipeline.tex
ADDED
%auto-ignore
\begin{figure*}[hbt!]
\centering
\includegraphics[width=1.0\textwidth]{figures/wire-pipeline.png}
\vspace{-2mm}
\caption{\textbf{Our wire removal system}. A system overview of our wire segmentation and removal for high-resolution images. The input is concatenated with min- and max-filtered luminance channels. The downsampled input is fed into the coarse module to obtain the global probability map. In the local stage, original-resolution patches are concatenated with the global probability map to obtain the local logit map. After the segmentation mask is predicted, we adopt the LaMa architecture and use a tile-based approach to achieve wire removal. See Sections~\ref{sec:segmentation} and~\ref{sec:inpainting} for details.}
\vspace{-1mm}
\label{fig:pipeline}
\end{figure*}
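To make the data flow in the caption above concrete, here is a small sketch of the two-stage inference loop: the MinMax-augmented image is downsampled for the coarse module, the resulting global probability map is brought back to full resolution and concatenated channel-wise with each original-resolution patch, and the fine module produces the local logits. The names `coarse_model` and `fine_model`, the 512-pixel coarse size, and the 1024-pixel patch size are stand-ins, not the released implementation; the tile-based LaMa removal step that follows segmentation is omitted.

```python
# Sketch of two-stage coarse-to-fine inference (stand-in models, not the released code).
import numpy as np

def resize(a, hw):
    # Nearest-neighbour resize; enough for a sketch and needs no extra deps.
    h, w = hw
    ys = np.arange(h) * a.shape[0] // h
    xs = np.arange(w) * a.shape[1] // w
    return a[ys][:, xs]

def two_stage_segment(image5, coarse_model, fine_model,
                      coarse_size=(512, 512), patch=1024):
    """image5: (H, W, 5) MinMax-augmented input.
    coarse_model: (h, w, 5) array -> (h, w) wire probability map.
    fine_model:   (h, w, 6) array (5 channels + global prob) -> (h, w) logits."""
    H, W, _ = image5.shape
    # Stage 1: global context from the heavily downsampled image.
    small = resize(image5, coarse_size)
    global_prob = resize(coarse_model(small), (H, W))
    # Stage 2: full-resolution patches, each conditioned on the global probability.
    logits = np.zeros((H, W), dtype=np.float32)
    for y in range(0, H, patch):
        for x in range(0, W, patch):
            ys, xs = slice(y, min(y + patch, H)), slice(x, min(x + patch, W))
            inp = np.concatenate([image5[ys, xs], global_prob[ys, xs, None]], axis=-1)
            logits[ys, xs] = fine_model(inp)
    return logits > 0.0  # binary wire mask

# Toy stand-ins so the sketch runs end to end.
coarse_model = lambda x: (x[..., 3] - x[..., 4] < -0.5).astype(np.float32)  # high min/max contrast
fine_model = lambda p: p[..., 5] * 2.0 - 1.0                                # trust the global prob
mask = two_stage_segment(np.random.rand(1024, 1536, 5).astype(np.float32),
                         coarse_model, fine_model)
print(mask.shape, mask.dtype)
```

Conditioning each local patch on the global probability map is what lets the fine module avoid the false positives that purely local sliding-window inference produces, as noted in the commented-out caption of comparison.tex.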
paper-tex/figure_tex/pixel6.tex
ADDED
%auto-ignore
\begin{figure}[h!]
\setlength{\abovecaptionskip}{1mm}
\centering
\captionsetup{type=figure}
\includegraphics[width=0.43\textwidth]{figures/wire-pixel6.pdf}
\vspace{-1mm}
\captionof{figure}{\textbf{Comparison with Pixel 6}. Our model can pick up hardly visible wires, even in complicated backgrounds.}
\label{fig:pixel6}
\end{figure}
paper-tex/figure_tex/teaser.tex
ADDED
%auto-ignore
\twocolumn[{%
\renewcommand\twocolumn[1][]{#1}%
\begin{center}
\maketitle
\centering
\begin{tabular}{@{}c@{\hspace{1mm}}c@{\hspace{1mm}}c@{}}
\captionsetup{type=figure}
\includegraphics[width=0.5\linewidth]{figures/teaser/Picture2.jpg} &
\includegraphics[width=0.5\linewidth]{figures/teaser/Picture4.jpg} \\
Input Image & Our Wire Segmentation Result \\
% \includegraphics[width=0.33\linewidth]{figures/teaser/1_caf.png} \\
\end{tabular}
\captionof{figure}{\textbf{Our wire-like object segmentation result}. The input image (left) is a 12 Megapixel image taken by a smartphone. Our wire-like object segmentation produces a high-quality result (right) despite the complexity and variation of the wires in the input image. (All figures in this paper are best viewed in full resolution on a big screen.)}
\end{center}%
}]
paper-tex/figures/additional_visualizations.pdf
ADDED
version https://git-lfs.github.com/spec/v1
oid sha256:6c54751212b9a552233f3108ef66ad62464edd4777f23d59e41ecbeae2f09fc7
size 826719
paper-tex/figures/annotation.pdf
ADDED
version https://git-lfs.github.com/spec/v1
oid sha256:06c9b1a5acf8058b220e41e16a7fb1deabbcd27a2066e715cad7386bec976035
size 201747
paper-tex/figures/inpainting_result.pdf
ADDED
version https://git-lfs.github.com/spec/v1
oid sha256:8cfd220fae94816922173c8ab3d46c7b46601f31059f9490ca18e228502094fa
size 4586003
paper-tex/figures/new_failures.pdf
ADDED
version https://git-lfs.github.com/spec/v1
oid sha256:5b27c6a579f3029e483f8ee6ebeae281f330f7567fa63271fcaf8862e5971602
size 121316
paper-tex/figures/new_panorama.pdf
ADDED
version https://git-lfs.github.com/spec/v1
oid sha256:fe64a9b8f05d7fdd0eb5e0b4716d3e0c50009b75d7f960b9221cf71306362fc2
size 865378
paper-tex/figures/panorama.pdf
ADDED
version https://git-lfs.github.com/spec/v1
oid sha256:571daeb6fe856e769e3b0911e63207dca2ab7ec7800a9b9e071ecee60e0df9a2
size 2475508
paper-tex/figures/pixel6.pdf
ADDED
version https://git-lfs.github.com/spec/v1
oid sha256:65360277495b27fc7bba97c7c23b2abe4f3bc56dfe7edef18b70710103fc3532
size 2317563
paper-tex/figures/qualitative.pdf
ADDED
version https://git-lfs.github.com/spec/v1
oid sha256:3da1671ec43dbc8734c06f15c573b39b34eef7ba71da3e3deef8293f13181149
size 1232647
paper-tex/figures/teaser2.pdf
ADDED
version https://git-lfs.github.com/spec/v1
oid sha256:d0479a842e86f66c4b3a2963fe9c9f23885fa6a6c6dc7a8e1bbe056e054d2c59
size 8619642
paper-tex/figures/wire-ablation.pdf
ADDED
version https://git-lfs.github.com/spec/v1
oid sha256:d6b302521df024fa15cd61470eb367507e8fd1f97adb6cee9b18f8882596c81a
size 353551
paper-tex/figures/wire-pipeline.png
ADDED
Git LFS Details
paper-tex/figures/wire-pixel6.pdf
ADDED
version https://git-lfs.github.com/spec/v1
oid sha256:094445913469d368179893224d55e60ca5c4ad21d8506aa495e0b5734c1abe6b
size 10657467
paper-tex/figures/wire-qualitative.pdf
ADDED
version https://git-lfs.github.com/spec/v1
oid sha256:5484ce337ed151049fd89f9a05f190f22cda15c97ee4d3911312752c80f5f4a1
size 588630
paper-tex/figures/wireishard.pdf
ADDED
version https://git-lfs.github.com/spec/v1
oid sha256:6b8153a4753f1c270f06855f724a2d0ebe8e026cf4a035928ae5cc666f5fef6f
size 86567
paper-tex/ieee_fullname.bst
ADDED
|
@@ -0,0 +1,1135 @@
| 1 |
+
% This is a modification to the normal ieee.bst used by CVPR to render
|
| 2 |
+
% first names in the bibliography as "Firstname Lastname" rather than
|
| 3 |
+
% "F. Lastname".
|
| 4 |
+
%
|
| 5 |
+
% Jonathan T. Barron, 12/5/2018, [email protected]
|
| 6 |
+
|
| 7 |
+
% ---------------------------------------------------------------
|
| 8 |
+
%
|
| 9 |
+
% ieee.bst,v 1.0 2002/04/16
|
| 10 |
+
%
|
| 11 |
+
% by Glenn Paulley ([email protected])
|
| 12 |
+
%
|
| 13 |
+
% Modified from latex8.bst 1995/09/15 15:13:49 ienne Exp $
|
| 14 |
+
%
|
| 15 |
+
% by [email protected]
|
| 16 |
+
%
|
| 17 |
+
%
|
| 18 |
+
% ---------------------------------------------------------------
|
| 19 |
+
%
|
| 20 |
+
% no guarantee is given that the format corresponds perfectly to
|
| 21 |
+
% IEEE 8.5" x 11" Proceedings, but most features should be ok.
|
| 22 |
+
%
|
| 23 |
+
% ---------------------------------------------------------------
|
| 24 |
+
%
|
| 25 |
+
% `ieee' from BibTeX standard bibliography style `abbrv'
|
| 26 |
+
% version 0.99a for BibTeX versions 0.99a or later, LaTeX version 2.09.
|
| 27 |
+
% Copyright (C) 1985, all rights reserved.
|
| 28 |
+
% Copying of this file is authorized only if either
|
| 29 |
+
% (1) you make absolutely no changes to your copy, including name, or
|
| 30 |
+
% (2) if you do make changes, you name it something other than
|
| 31 |
+
% btxbst.doc, plain.bst, unsrt.bst, alpha.bst, and abbrv.bst.
|
| 32 |
+
% This restriction helps ensure that all standard styles are identical.
|
| 33 |
+
% The file btxbst.doc has the documentation for this style.
|
| 34 |
+
|
| 35 |
+
ENTRY
|
| 36 |
+
{ address
|
| 37 |
+
author
|
| 38 |
+
booktitle
|
| 39 |
+
chapter
|
| 40 |
+
edition
|
| 41 |
+
editor
|
| 42 |
+
howpublished
|
| 43 |
+
institution
|
| 44 |
+
journal
|
| 45 |
+
key
|
| 46 |
+
month
|
| 47 |
+
note
|
| 48 |
+
number
|
| 49 |
+
organization
|
| 50 |
+
pages
|
| 51 |
+
publisher
|
| 52 |
+
school
|
| 53 |
+
series
|
| 54 |
+
title
|
| 55 |
+
type
|
| 56 |
+
volume
|
| 57 |
+
year
|
| 58 |
+
}
|
| 59 |
+
{}
|
| 60 |
+
{ label }
|
| 61 |
+
|
| 62 |
+
INTEGERS { output.state before.all mid.sentence after.sentence after.block }
|
| 63 |
+
|
| 64 |
+
FUNCTION {init.state.consts}
|
| 65 |
+
{ #0 'before.all :=
|
| 66 |
+
#1 'mid.sentence :=
|
| 67 |
+
#2 'after.sentence :=
|
| 68 |
+
#3 'after.block :=
|
| 69 |
+
}
|
| 70 |
+
|
| 71 |
+
STRINGS { s t }
|
| 72 |
+
|
| 73 |
+
FUNCTION {output.nonnull}
|
| 74 |
+
{ 's :=
|
| 75 |
+
output.state mid.sentence =
|
| 76 |
+
{ ", " * write$ }
|
| 77 |
+
{ output.state after.block =
|
| 78 |
+
{ add.period$ write$
|
| 79 |
+
newline$
|
| 80 |
+
"\newblock " write$
|
| 81 |
+
}
|
| 82 |
+
{ output.state before.all =
|
| 83 |
+
'write$
|
| 84 |
+
{ add.period$ " " * write$ }
|
| 85 |
+
if$
|
| 86 |
+
}
|
| 87 |
+
if$
|
| 88 |
+
mid.sentence 'output.state :=
|
| 89 |
+
}
|
| 90 |
+
if$
|
| 91 |
+
s
|
| 92 |
+
}
|
| 93 |
+
|
| 94 |
+
FUNCTION {output}
|
| 95 |
+
{ duplicate$ empty$
|
| 96 |
+
'pop$
|
| 97 |
+
'output.nonnull
|
| 98 |
+
if$
|
| 99 |
+
}
|
| 100 |
+
|
| 101 |
+
FUNCTION {output.check}
|
| 102 |
+
{ 't :=
|
| 103 |
+
duplicate$ empty$
|
| 104 |
+
{ pop$ "empty " t * " in " * cite$ * warning$ }
|
| 105 |
+
'output.nonnull
|
| 106 |
+
if$
|
| 107 |
+
}
|
| 108 |
+
|
| 109 |
+
FUNCTION {output.bibitem}
|
| 110 |
+
{ newline$
|
| 111 |
+
"\bibitem{" write$
|
| 112 |
+
cite$ write$
|
| 113 |
+
"}" write$
|
| 114 |
+
newline$
|
| 115 |
+
""
|
| 116 |
+
before.all 'output.state :=
|
| 117 |
+
}
|
| 118 |
+
|
| 119 |
+
FUNCTION {fin.entry}
|
| 120 |
+
{ add.period$
|
| 121 |
+
write$
|
| 122 |
+
newline$
|
| 123 |
+
}
|
| 124 |
+
|
| 125 |
+
FUNCTION {new.block}
|
| 126 |
+
{ output.state before.all =
|
| 127 |
+
'skip$
|
| 128 |
+
{ after.block 'output.state := }
|
| 129 |
+
if$
|
| 130 |
+
}
|
| 131 |
+
|
| 132 |
+
FUNCTION {new.sentence}
|
| 133 |
+
{ output.state after.block =
|
| 134 |
+
'skip$
|
| 135 |
+
{ output.state before.all =
|
| 136 |
+
'skip$
|
| 137 |
+
{ after.sentence 'output.state := }
|
| 138 |
+
if$
|
| 139 |
+
}
|
| 140 |
+
if$
|
| 141 |
+
}
|
| 142 |
+
|
| 143 |
+
FUNCTION {not}
|
| 144 |
+
{ { #0 }
|
| 145 |
+
{ #1 }
|
| 146 |
+
if$
|
| 147 |
+
}
|
| 148 |
+
|
| 149 |
+
FUNCTION {and}
|
| 150 |
+
{ 'skip$
|
| 151 |
+
{ pop$ #0 }
|
| 152 |
+
if$
|
| 153 |
+
}
|
| 154 |
+
|
| 155 |
+
FUNCTION {or}
|
| 156 |
+
{ { pop$ #1 }
|
| 157 |
+
'skip$
|
| 158 |
+
if$
|
| 159 |
+
}
|
| 160 |
+
|
| 161 |
+
FUNCTION {new.block.checka}
|
| 162 |
+
{ empty$
|
| 163 |
+
'skip$
|
| 164 |
+
'new.block
|
| 165 |
+
if$
|
| 166 |
+
}
|
| 167 |
+
|
| 168 |
+
FUNCTION {new.block.checkb}
|
| 169 |
+
{ empty$
|
| 170 |
+
swap$ empty$
|
| 171 |
+
and
|
| 172 |
+
'skip$
|
| 173 |
+
'new.block
|
| 174 |
+
if$
|
| 175 |
+
}
|
| 176 |
+
|
| 177 |
+
FUNCTION {new.sentence.checka}
|
| 178 |
+
{ empty$
|
| 179 |
+
'skip$
|
| 180 |
+
'new.sentence
|
| 181 |
+
if$
|
| 182 |
+
}
|
| 183 |
+
|
| 184 |
+
FUNCTION {new.sentence.checkb}
|
| 185 |
+
{ empty$
|
| 186 |
+
swap$ empty$
|
| 187 |
+
and
|
| 188 |
+
'skip$
|
| 189 |
+
'new.sentence
|
| 190 |
+
if$
|
| 191 |
+
}
|
| 192 |
+
|
| 193 |
+
FUNCTION {field.or.null}
|
| 194 |
+
{ duplicate$ empty$
|
| 195 |
+
{ pop$ "" }
|
| 196 |
+
'skip$
|
| 197 |
+
if$
|
| 198 |
+
}
|
| 199 |
+
|
| 200 |
+
FUNCTION {emphasize}
|
| 201 |
+
{ duplicate$ empty$
|
| 202 |
+
{ pop$ "" }
|
| 203 |
+
{ "{\em " swap$ * "}" * }
|
| 204 |
+
if$
|
| 205 |
+
}
|
| 206 |
+
|
| 207 |
+
INTEGERS { nameptr namesleft numnames }
|
| 208 |
+
|
| 209 |
+
FUNCTION {format.names}
|
| 210 |
+
{ 's :=
|
| 211 |
+
#1 'nameptr :=
|
| 212 |
+
s num.names$ 'numnames :=
|
| 213 |
+
numnames 'namesleft :=
|
| 214 |
+
{ namesleft #0 > }
|
| 215 |
+
% Formerly { s nameptr "{f.~}{vv~}{ll}{, jj}" format.name$ 't :=
|
| 216 |
+
{ s nameptr "{ff }{vv }{ll}{, jj}" format.name$ 't :=
|
| 217 |
+
nameptr #1 >
|
| 218 |
+
{ namesleft #1 >
|
| 219 |
+
{ ", " * t * }
|
| 220 |
+
{ numnames #2 >
|
| 221 |
+
{ "," * }
|
| 222 |
+
'skip$
|
| 223 |
+
if$
|
| 224 |
+
t "others" =
|
| 225 |
+
{ " et~al." * }
|
| 226 |
+
{ " and " * t * }
|
| 227 |
+
if$
|
| 228 |
+
}
|
| 229 |
+
if$
|
| 230 |
+
}
|
| 231 |
+
't
|
| 232 |
+
if$
|
| 233 |
+
nameptr #1 + 'nameptr :=
|
| 234 |
+
|
| 235 |
+
namesleft #1 - 'namesleft :=
|
| 236 |
+
}
|
| 237 |
+
while$
|
| 238 |
+
}
|
| 239 |
+
|
| 240 |
+
FUNCTION {format.authors}
|
| 241 |
+
{ author empty$
|
| 242 |
+
{ "" }
|
| 243 |
+
{ author format.names }
|
| 244 |
+
if$
|
| 245 |
+
}
|
| 246 |
+
|
| 247 |
+
FUNCTION {format.editors}
|
| 248 |
+
{ editor empty$
|
| 249 |
+
{ "" }
|
| 250 |
+
{ editor format.names
|
| 251 |
+
editor num.names$ #1 >
|
| 252 |
+
{ ", editors" * }
|
| 253 |
+
{ ", editor" * }
|
| 254 |
+
if$
|
| 255 |
+
}
|
| 256 |
+
if$
|
| 257 |
+
}
|
| 258 |
+
|
| 259 |
+
FUNCTION {format.title}
|
| 260 |
+
{ title empty$
|
| 261 |
+
{ "" }
|
| 262 |
+
{ title "t" change.case$ }
|
| 263 |
+
if$
|
| 264 |
+
}
|
| 265 |
+
|
| 266 |
+
FUNCTION {n.dashify}
|
| 267 |
+
{ 't :=
|
| 268 |
+
""
|
| 269 |
+
{ t empty$ not }
|
| 270 |
+
{ t #1 #1 substring$ "-" =
|
| 271 |
+
{ t #1 #2 substring$ "--" = not
|
| 272 |
+
{ "--" *
|
| 273 |
+
t #2 global.max$ substring$ 't :=
|
| 274 |
+
}
|
| 275 |
+
{ { t #1 #1 substring$ "-" = }
|
| 276 |
+
{ "-" *
|
| 277 |
+
t #2 global.max$ substring$ 't :=
|
| 278 |
+
}
|
| 279 |
+
while$
|
| 280 |
+
}
|
| 281 |
+
if$
|
| 282 |
+
}
|
| 283 |
+
{ t #1 #1 substring$ *
|
| 284 |
+
t #2 global.max$ substring$ 't :=
|
| 285 |
+
}
|
| 286 |
+
if$
|
| 287 |
+
}
|
| 288 |
+
while$
|
| 289 |
+
}
|
| 290 |
+
|
| 291 |
+
FUNCTION {format.date}
|
| 292 |
+
{ year empty$
|
| 293 |
+
{ month empty$
|
| 294 |
+
{ "" }
|
| 295 |
+
{ "there's a month but no year in " cite$ * warning$
|
| 296 |
+
month
|
| 297 |
+
}
|
| 298 |
+
if$
|
| 299 |
+
}
|
| 300 |
+
{ month empty$
|
| 301 |
+
'year
|
| 302 |
+
{ month " " * year * }
|
| 303 |
+
if$
|
| 304 |
+
}
|
| 305 |
+
if$
|
| 306 |
+
}
|
| 307 |
+
|
| 308 |
+
FUNCTION {format.btitle}
|
| 309 |
+
{ title emphasize
|
| 310 |
+
}
|
| 311 |
+
|
| 312 |
+
FUNCTION {tie.or.space.connect}
|
| 313 |
+
{ duplicate$ text.length$ #3 <
|
| 314 |
+
{ "~" }
|
| 315 |
+
{ " " }
|
| 316 |
+
if$
|
| 317 |
+
swap$ * *
|
| 318 |
+
}
|
| 319 |
+
|
| 320 |
+
FUNCTION {either.or.check}
|
| 321 |
+
{ empty$
|
| 322 |
+
'pop$
|
| 323 |
+
{ "can't use both " swap$ * " fields in " * cite$ * warning$ }
|
| 324 |
+
if$
|
| 325 |
+
}
|
| 326 |
+
|
| 327 |
+
FUNCTION {format.bvolume}
|
| 328 |
+
{ volume empty$
|
| 329 |
+
{ "" }
|
| 330 |
+
{ "volume" volume tie.or.space.connect
|
| 331 |
+
series empty$
|
| 332 |
+
'skip$
|
| 333 |
+
{ " of " * series emphasize * }
|
| 334 |
+
if$
|
| 335 |
+
"volume and number" number either.or.check
|
| 336 |
+
}
|
| 337 |
+
if$
|
| 338 |
+
}
|
| 339 |
+
|
| 340 |
+
FUNCTION {format.number.series}
|
| 341 |
+
{ volume empty$
|
| 342 |
+
{ number empty$
|
| 343 |
+
{ series field.or.null }
|
| 344 |
+
{ output.state mid.sentence =
|
| 345 |
+
{ "number" }
|
| 346 |
+
{ "Number" }
|
| 347 |
+
if$
|
| 348 |
+
number tie.or.space.connect
|
| 349 |
+
series empty$
|
| 350 |
+
{ "there's a number but no series in " cite$ * warning$ }
|
| 351 |
+
{ " in " * series * }
|
| 352 |
+
if$
|
| 353 |
+
}
|
| 354 |
+
if$
|
| 355 |
+
}
|
| 356 |
+
{ "" }
|
| 357 |
+
if$
|
| 358 |
+
}
|
| 359 |
+
|
| 360 |
+
FUNCTION {format.edition}
|
| 361 |
+
{ edition empty$
|
| 362 |
+
{ "" }
|
| 363 |
+
{ output.state mid.sentence =
|
| 364 |
+
{ edition "l" change.case$ " edition" * }
|
| 365 |
+
{ edition "t" change.case$ " edition" * }
|
| 366 |
+
if$
|
| 367 |
+
}
|
| 368 |
+
if$
|
| 369 |
+
}
|
| 370 |
+
|
| 371 |
+
INTEGERS { multiresult }
|
| 372 |
+
|
| 373 |
+
FUNCTION {multi.page.check}
|
| 374 |
+
{ 't :=
|
| 375 |
+
#0 'multiresult :=
|
| 376 |
+
{ multiresult not
|
| 377 |
+
t empty$ not
|
| 378 |
+
and
|
| 379 |
+
}
|
| 380 |
+
{ t #1 #1 substring$
|
| 381 |
+
duplicate$ "-" =
|
| 382 |
+
swap$ duplicate$ "," =
|
| 383 |
+
swap$ "+" =
|
| 384 |
+
or or
|
| 385 |
+
{ #1 'multiresult := }
|
| 386 |
+
{ t #2 global.max$ substring$ 't := }
|
| 387 |
+
if$
|
| 388 |
+
}
|
| 389 |
+
while$
|
| 390 |
+
multiresult
|
| 391 |
+
}
|
| 392 |
+
|
| 393 |
+
FUNCTION {format.pages}
|
| 394 |
+
{ pages empty$
|
| 395 |
+
{ "" }
|
| 396 |
+
{ pages multi.page.check
|
| 397 |
+
{ "pages" pages n.dashify tie.or.space.connect }
|
| 398 |
+
{ "page" pages tie.or.space.connect }
|
| 399 |
+
if$
|
| 400 |
+
}
|
| 401 |
+
if$
|
| 402 |
+
}
|
| 403 |
+
|
| 404 |
+
FUNCTION {format.vol.num.pages}
|
| 405 |
+
{ volume field.or.null
|
| 406 |
+
number empty$
|
| 407 |
+
'skip$
|
| 408 |
+
{ "(" number * ")" * *
|
| 409 |
+
volume empty$
|
| 410 |
+
{ "there's a number but no volume in " cite$ * warning$ }
|
| 411 |
+
'skip$
|
| 412 |
+
if$
|
| 413 |
+
}
|
| 414 |
+
if$
|
| 415 |
+
pages empty$
|
| 416 |
+
'skip$
|
| 417 |
+
{ duplicate$ empty$
|
| 418 |
+
{ pop$ format.pages }
|
| 419 |
+
{ ":" * pages n.dashify * }
|
| 420 |
+
if$
|
| 421 |
+
}
|
| 422 |
+
if$
|
| 423 |
+
}
|
| 424 |
+
|
| 425 |
+
FUNCTION {format.chapter.pages}
|
| 426 |
+
{ chapter empty$
|
| 427 |
+
'format.pages
|
| 428 |
+
{ type empty$
|
| 429 |
+
{ "chapter" }
|
| 430 |
+
{ type "l" change.case$ }
|
| 431 |
+
if$
|
| 432 |
+
chapter tie.or.space.connect
|
| 433 |
+
pages empty$
|
| 434 |
+
'skip$
|
| 435 |
+
{ ", " * format.pages * }
|
| 436 |
+
if$
|
| 437 |
+
}
|
| 438 |
+
if$
|
| 439 |
+
}
|
| 440 |
+
|
| 441 |
+
FUNCTION {format.in.ed.booktitle}
|
| 442 |
+
{ booktitle empty$
|
| 443 |
+
{ "" }
|
| 444 |
+
{ editor empty$
|
| 445 |
+
{ "In " booktitle emphasize * }
|
| 446 |
+
{ "In " format.editors * ", " * booktitle emphasize * }
|
| 447 |
+
if$
|
| 448 |
+
}
|
| 449 |
+
if$
|
| 450 |
+
}
|
| 451 |
+
|
| 452 |
+
FUNCTION {empty.misc.check}
|
| 453 |
+
|
| 454 |
+
{ author empty$ title empty$ howpublished empty$
|
| 455 |
+
month empty$ year empty$ note empty$
|
| 456 |
+
and and and and and
|
| 457 |
+
key empty$ not and
|
| 458 |
+
{ "all relevant fields are empty in " cite$ * warning$ }
|
| 459 |
+
'skip$
|
| 460 |
+
if$
|
| 461 |
+
}
|
| 462 |
+
|
| 463 |
+
FUNCTION {format.thesis.type}
|
| 464 |
+
{ type empty$
|
| 465 |
+
'skip$
|
| 466 |
+
{ pop$
|
| 467 |
+
type "t" change.case$
|
| 468 |
+
}
|
| 469 |
+
if$
|
| 470 |
+
}
|
| 471 |
+
|
| 472 |
+
FUNCTION {format.tr.number}
|
| 473 |
+
{ type empty$
|
| 474 |
+
{ "Technical Report" }
|
| 475 |
+
'type
|
| 476 |
+
if$
|
| 477 |
+
number empty$
|
| 478 |
+
{ "t" change.case$ }
|
| 479 |
+
{ number tie.or.space.connect }
|
| 480 |
+
if$
|
| 481 |
+
}
|
| 482 |
+
|
| 483 |
+
FUNCTION {format.article.crossref}
|
| 484 |
+
{ key empty$
|
| 485 |
+
{ journal empty$
|
| 486 |
+
{ "need key or journal for " cite$ * " to crossref " * crossref *
|
| 487 |
+
warning$
|
| 488 |
+
""
|
| 489 |
+
}
|
| 490 |
+
{ "In {\em " journal * "\/}" * }
|
| 491 |
+
if$
|
| 492 |
+
}
|
| 493 |
+
{ "In " key * }
|
| 494 |
+
if$
|
| 495 |
+
" \cite{" * crossref * "}" *
|
| 496 |
+
}
|
| 497 |
+
|
| 498 |
+
FUNCTION {format.crossref.editor}
|
| 499 |
+
{ editor #1 "{vv~}{ll}" format.name$
|
| 500 |
+
editor num.names$ duplicate$
|
| 501 |
+
#2 >
|
| 502 |
+
{ pop$ " et~al." * }
|
| 503 |
+
{ #2 <
|
| 504 |
+
'skip$
|
| 505 |
+
{ editor #2 "{ff }{vv }{ll}{ jj}" format.name$ "others" =
|
| 506 |
+
{ " et~al." * }
|
| 507 |
+
{ " and " * editor #2 "{vv~}{ll}" format.name$ * }
|
| 508 |
+
if$
|
| 509 |
+
}
|
| 510 |
+
if$
|
| 511 |
+
}
|
| 512 |
+
if$
|
| 513 |
+
}
|
| 514 |
+
|
| 515 |
+
FUNCTION {format.book.crossref}
|
| 516 |
+
{ volume empty$
|
| 517 |
+
{ "empty volume in " cite$ * "'s crossref of " * crossref * warning$
|
| 518 |
+
"In "
|
| 519 |
+
}
|
| 520 |
+
{ "Volume" volume tie.or.space.connect
|
| 521 |
+
" of " *
|
| 522 |
+
}
|
| 523 |
+
if$
|
| 524 |
+
editor empty$
|
| 525 |
+
editor field.or.null author field.or.null =
|
| 526 |
+
or
|
| 527 |
+
{ key empty$
|
| 528 |
+
{ series empty$
|
| 529 |
+
{ "need editor, key, or series for " cite$ * " to crossref " *
|
| 530 |
+
crossref * warning$
|
| 531 |
+
"" *
|
| 532 |
+
}
|
| 533 |
+
{ "{\em " * series * "\/}" * }
|
| 534 |
+
if$
|
| 535 |
+
}
|
| 536 |
+
{ key * }
|
| 537 |
+
if$
|
| 538 |
+
}
|
| 539 |
+
{ format.crossref.editor * }
|
| 540 |
+
if$
|
| 541 |
+
" \cite{" * crossref * "}" *
|
| 542 |
+
}
|
| 543 |
+
|
| 544 |
+
FUNCTION {format.incoll.inproc.crossref}
|
| 545 |
+
{ editor empty$
|
| 546 |
+
editor field.or.null author field.or.null =
|
| 547 |
+
or
|
| 548 |
+
{ key empty$
|
| 549 |
+
{ booktitle empty$
|
| 550 |
+
{ "need editor, key, or booktitle for " cite$ * " to crossref " *
|
| 551 |
+
crossref * warning$
|
| 552 |
+
""
|
| 553 |
+
}
|
| 554 |
+
{ "In {\em " booktitle * "\/}" * }
|
| 555 |
+
if$
|
| 556 |
+
}
|
| 557 |
+
{ "In " key * }
|
| 558 |
+
if$
|
| 559 |
+
}
|
| 560 |
+
{ "In " format.crossref.editor * }
|
| 561 |
+
if$
|
| 562 |
+
" \cite{" * crossref * "}" *
|
| 563 |
+
}
|
| 564 |
+
|
| 565 |
+
FUNCTION {article}
|
| 566 |
+
{ output.bibitem
|
| 567 |
+
format.authors "author" output.check
|
| 568 |
+
new.block
|
| 569 |
+
format.title "title" output.check
|
| 570 |
+
new.block
|
| 571 |
+
crossref missing$
|
| 572 |
+
{ journal emphasize "journal" output.check
|
| 573 |
+
format.vol.num.pages output
|
| 574 |
+
format.date "year" output.check
|
| 575 |
+
}
|
| 576 |
+
{ format.article.crossref output.nonnull
|
| 577 |
+
format.pages output
|
| 578 |
+
}
|
| 579 |
+
if$
|
| 580 |
+
new.block
|
| 581 |
+
note output
|
| 582 |
+
fin.entry
|
| 583 |
+
}
|
| 584 |
+
|
| 585 |
+
FUNCTION {book}
|
| 586 |
+
{ output.bibitem
|
| 587 |
+
author empty$
|
| 588 |
+
{ format.editors "author and editor" output.check }
|
| 589 |
+
{ format.authors output.nonnull
|
| 590 |
+
crossref missing$
|
| 591 |
+
{ "author and editor" editor either.or.check }
|
| 592 |
+
'skip$
|
| 593 |
+
if$
|
| 594 |
+
}
|
| 595 |
+
if$
|
| 596 |
+
new.block
|
| 597 |
+
format.btitle "title" output.check
|
| 598 |
+
crossref missing$
|
| 599 |
+
{ format.bvolume output
|
| 600 |
+
new.block
|
| 601 |
+
format.number.series output
|
| 602 |
+
new.sentence
|
| 603 |
+
publisher "publisher" output.check
|
| 604 |
+
address output
|
| 605 |
+
}
|
| 606 |
+
{ new.block
|
| 607 |
+
format.book.crossref output.nonnull
|
| 608 |
+
}
|
| 609 |
+
if$
|
| 610 |
+
format.edition output
|
| 611 |
+
format.date "year" output.check
|
| 612 |
+
new.block
|
| 613 |
+
note output
|
| 614 |
+
fin.entry
|
| 615 |
+
}
|
| 616 |
+
|
| 617 |
+
FUNCTION {booklet}
|
| 618 |
+
{ output.bibitem
|
| 619 |
+
format.authors output
|
| 620 |
+
new.block
|
| 621 |
+
format.title "title" output.check
|
| 622 |
+
howpublished address new.block.checkb
|
| 623 |
+
howpublished output
|
| 624 |
+
address output
|
| 625 |
+
format.date output
|
| 626 |
+
new.block
|
| 627 |
+
note output
|
| 628 |
+
fin.entry
|
| 629 |
+
}
|
| 630 |
+
|
| 631 |
+
FUNCTION {inbook}
|
| 632 |
+
{ output.bibitem
|
| 633 |
+
author empty$
|
| 634 |
+
{ format.editors "author and editor" output.check }
|
| 635 |
+
{ format.authors output.nonnull
|
| 636 |
+
|
| 637 |
+
crossref missing$
|
| 638 |
+
{ "author and editor" editor either.or.check }
|
| 639 |
+
'skip$
|
| 640 |
+
if$
|
| 641 |
+
}
|
| 642 |
+
if$
|
| 643 |
+
new.block
|
| 644 |
+
format.btitle "title" output.check
|
| 645 |
+
crossref missing$
|
| 646 |
+
{ format.bvolume output
|
| 647 |
+
format.chapter.pages "chapter and pages" output.check
|
| 648 |
+
new.block
|
| 649 |
+
format.number.series output
|
| 650 |
+
new.sentence
|
| 651 |
+
publisher "publisher" output.check
|
| 652 |
+
address output
|
| 653 |
+
}
|
| 654 |
+
{ format.chapter.pages "chapter and pages" output.check
|
| 655 |
+
new.block
|
| 656 |
+
format.book.crossref output.nonnull
|
| 657 |
+
}
|
| 658 |
+
if$
|
| 659 |
+
format.edition output
|
| 660 |
+
format.date "year" output.check
|
| 661 |
+
new.block
|
| 662 |
+
note output
|
| 663 |
+
fin.entry
|
| 664 |
+
}
|
| 665 |
+
|
| 666 |
+
FUNCTION {incollection}
|
| 667 |
+
{ output.bibitem
|
| 668 |
+
format.authors "author" output.check
|
| 669 |
+
new.block
|
| 670 |
+
format.title "title" output.check
|
| 671 |
+
new.block
|
| 672 |
+
crossref missing$
|
| 673 |
+
{ format.in.ed.booktitle "booktitle" output.check
|
| 674 |
+
format.bvolume output
|
| 675 |
+
format.number.series output
|
| 676 |
+
format.chapter.pages output
|
| 677 |
+
new.sentence
|
| 678 |
+
publisher "publisher" output.check
|
| 679 |
+
address output
|
| 680 |
+
format.edition output
|
| 681 |
+
format.date "year" output.check
|
| 682 |
+
}
|
| 683 |
+
{ format.incoll.inproc.crossref output.nonnull
|
| 684 |
+
format.chapter.pages output
|
| 685 |
+
}
|
| 686 |
+
if$
|
| 687 |
+
new.block
|
| 688 |
+
note output
|
| 689 |
+
fin.entry
|
| 690 |
+
}
|
| 691 |
+
|
| 692 |
+
FUNCTION {inproceedings}
|
| 693 |
+
{ output.bibitem
|
| 694 |
+
format.authors "author" output.check
|
| 695 |
+
new.block
|
| 696 |
+
format.title "title" output.check
|
| 697 |
+
new.block
|
| 698 |
+
crossref missing$
|
| 699 |
+
{ format.in.ed.booktitle "booktitle" output.check
|
| 700 |
+
format.bvolume output
|
| 701 |
+
format.number.series output
|
| 702 |
+
format.pages output
|
| 703 |
+
address empty$
|
| 704 |
+
{ organization publisher new.sentence.checkb
|
| 705 |
+
organization output
|
| 706 |
+
publisher output
|
| 707 |
+
format.date "year" output.check
|
| 708 |
+
}
|
| 709 |
+
{ address output.nonnull
|
| 710 |
+
format.date "year" output.check
|
| 711 |
+
new.sentence
|
| 712 |
+
organization output
|
| 713 |
+
publisher output
|
| 714 |
+
}
|
| 715 |
+
if$
|
| 716 |
+
}
|
| 717 |
+
{ format.incoll.inproc.crossref output.nonnull
|
| 718 |
+
format.pages output
|
| 719 |
+
}
|
| 720 |
+
if$
|
| 721 |
+
new.block
|
| 722 |
+
note output
|
| 723 |
+
fin.entry
|
| 724 |
+
}
|
| 725 |
+
|
| 726 |
+
FUNCTION {conference} { inproceedings }
|
| 727 |
+
|
| 728 |
+
FUNCTION {manual}
|
| 729 |
+
{ output.bibitem
|
| 730 |
+
author empty$
|
| 731 |
+
{ organization empty$
|
| 732 |
+
'skip$
|
| 733 |
+
{ organization output.nonnull
|
| 734 |
+
address output
|
| 735 |
+
}
|
| 736 |
+
if$
|
| 737 |
+
}
|
| 738 |
+
{ format.authors output.nonnull }
|
| 739 |
+
if$
|
| 740 |
+
new.block
|
| 741 |
+
format.btitle "title" output.check
|
| 742 |
+
author empty$
|
| 743 |
+
{ organization empty$
|
| 744 |
+
{ address new.block.checka
|
| 745 |
+
address output
|
| 746 |
+
}
|
| 747 |
+
'skip$
|
| 748 |
+
if$
|
| 749 |
+
}
|
| 750 |
+
{ organization address new.block.checkb
|
| 751 |
+
organization output
|
| 752 |
+
address output
|
| 753 |
+
}
|
| 754 |
+
if$
|
| 755 |
+
format.edition output
|
| 756 |
+
format.date output
|
| 757 |
+
new.block
|
| 758 |
+
note output
|
| 759 |
+
fin.entry
|
| 760 |
+
}
|
| 761 |
+
|
| 762 |
+
FUNCTION {mastersthesis}
|
| 763 |
+
{ output.bibitem
|
| 764 |
+
  format.authors "author" output.check
  new.block
  format.title "title" output.check
  new.block
  "Master's thesis" format.thesis.type output.nonnull
  school "school" output.check
  address output
  format.date "year" output.check
  new.block
  note output
  fin.entry
}

FUNCTION {misc}
{ output.bibitem
  format.authors output
  title howpublished new.block.checkb
  format.title output
  howpublished new.block.checka
  howpublished output
  format.date output
  new.block
  note output
  fin.entry
  empty.misc.check
}

FUNCTION {phdthesis}
{ output.bibitem
  format.authors "author" output.check
  new.block
  format.btitle "title" output.check
  new.block
  "PhD thesis" format.thesis.type output.nonnull
  school "school" output.check
  address output
  format.date "year" output.check
  new.block
  note output
  fin.entry
}

FUNCTION {proceedings}
{ output.bibitem
  editor empty$
    { organization output }
    { format.editors output.nonnull }

  if$
  new.block
  format.btitle "title" output.check
  format.bvolume output
  format.number.series output
  address empty$
    { editor empty$
        { publisher new.sentence.checka }
        { organization publisher new.sentence.checkb
          organization output
        }
      if$
      publisher output
      format.date "year" output.check
    }
    { address output.nonnull
      format.date "year" output.check
      new.sentence
      editor empty$
        'skip$
        { organization output }
      if$
      publisher output
    }
  if$
  new.block
  note output
  fin.entry
}

FUNCTION {techreport}
{ output.bibitem
  format.authors "author" output.check
  new.block
  format.title "title" output.check
  new.block
  format.tr.number output.nonnull
  institution "institution" output.check
  address output
  format.date "year" output.check
  new.block
  note output
  fin.entry
}

FUNCTION {unpublished}
{ output.bibitem
  format.authors "author" output.check
  new.block
  format.title "title" output.check
  new.block
  note "note" output.check
  format.date output
  fin.entry
}

FUNCTION {default.type} { misc }

MACRO {jan} {"Jan."}

MACRO {feb} {"Feb."}

MACRO {mar} {"Mar."}

MACRO {apr} {"Apr."}

MACRO {may} {"May"}

MACRO {jun} {"June"}

MACRO {jul} {"July"}

MACRO {aug} {"Aug."}

MACRO {sep} {"Sept."}

MACRO {oct} {"Oct."}

MACRO {nov} {"Nov."}

MACRO {dec} {"Dec."}

MACRO {acmcs} {"ACM Comput. Surv."}

MACRO {acta} {"Acta Inf."}

MACRO {cacm} {"Commun. ACM"}

MACRO {ibmjrd} {"IBM J. Res. Dev."}

MACRO {ibmsj} {"IBM Syst.~J."}

MACRO {ieeese} {"IEEE Trans. Softw. Eng."}

MACRO {ieeetc} {"IEEE Trans. Comput."}

MACRO {ieeetcad}
 {"IEEE Trans. Comput.-Aided Design Integrated Circuits"}

MACRO {ipl} {"Inf. Process. Lett."}

MACRO {jacm} {"J.~ACM"}

MACRO {jcss} {"J.~Comput. Syst. Sci."}

MACRO {scp} {"Sci. Comput. Programming"}

MACRO {sicomp} {"SIAM J. Comput."}

MACRO {tocs} {"ACM Trans. Comput. Syst."}

MACRO {tods} {"ACM Trans. Database Syst."}

MACRO {tog} {"ACM Trans. Gr."}

MACRO {toms} {"ACM Trans. Math. Softw."}

MACRO {toois} {"ACM Trans. Office Inf. Syst."}

MACRO {toplas} {"ACM Trans. Prog. Lang. Syst."}

MACRO {tcs} {"Theoretical Comput. Sci."}

READ

FUNCTION {sortify}
{ purify$
  "l" change.case$
}

INTEGERS { len }

FUNCTION {chop.word}
{ 's :=
  'len :=
  s #1 len substring$ =
    { s len #1 + global.max$ substring$ }
    's
  if$
}

FUNCTION {sort.format.names}
{ 's :=
  #1 'nameptr :=
  ""
  s num.names$ 'numnames :=
  numnames 'namesleft :=
    { namesleft #0 > }
    { nameptr #1 >
        { " " * }
        'skip$
      if$
      s nameptr "{vv{ } }{ll{ }}{ f{ }}{ jj{ }}" format.name$ 't :=
      nameptr numnames = t "others" = and
        { "et al" * }
        { t sortify * }
      if$
      nameptr #1 + 'nameptr :=
      namesleft #1 - 'namesleft :=
    }
  while$
}

FUNCTION {sort.format.title}
{ 't :=
  "A " #2
    "An " #3
      "The " #4 t chop.word
    chop.word
  chop.word
  sortify
  #1 global.max$ substring$
}

FUNCTION {author.sort}
{ author empty$
    { key empty$
        { "to sort, need author or key in " cite$ * warning$
          ""
        }
        { key sortify }
      if$
    }
    { author sort.format.names }
  if$
}

FUNCTION {author.editor.sort}
{ author empty$
    { editor empty$
        { key empty$
            { "to sort, need author, editor, or key in " cite$ * warning$
              ""
            }
            { key sortify }
          if$
        }
        { editor sort.format.names }
      if$
    }
    { author sort.format.names }
  if$
}

FUNCTION {author.organization.sort}
{ author empty$

    { organization empty$
        { key empty$
            { "to sort, need author, organization, or key in " cite$ * warning$
              ""
            }
            { key sortify }
          if$
        }
        { "The " #4 organization chop.word sortify }
      if$
    }
    { author sort.format.names }
  if$
}

FUNCTION {editor.organization.sort}
{ editor empty$
    { organization empty$
        { key empty$
            { "to sort, need editor, organization, or key in " cite$ * warning$
              ""
            }
            { key sortify }
          if$
        }
        { "The " #4 organization chop.word sortify }
      if$
    }
    { editor sort.format.names }
  if$
}

FUNCTION {presort}
{ type$ "book" =
  type$ "inbook" =
  or
    'author.editor.sort
    { type$ "proceedings" =
        'editor.organization.sort
        { type$ "manual" =
            'author.organization.sort
            'author.sort
          if$
        }
      if$
    }
  if$
  " "
  *
  year field.or.null sortify
  *
  " "
  *
  title field.or.null
  sort.format.title
  *
  #1 entry.max$ substring$
  'sort.key$ :=
}

ITERATE {presort}

SORT

STRINGS { longest.label }

INTEGERS { number.label longest.label.width }

FUNCTION {initialize.longest.label}
{ "" 'longest.label :=
  #1 'number.label :=
  #0 'longest.label.width :=
}

FUNCTION {longest.label.pass}
{ number.label int.to.str$ 'label :=
  number.label #1 + 'number.label :=
  label width$ longest.label.width >
    { label 'longest.label :=
      label width$ 'longest.label.width :=
    }
    'skip$
  if$
}

EXECUTE {initialize.longest.label}

ITERATE {longest.label.pass}

FUNCTION {begin.bib}
{ preamble$ empty$
    'skip$
    { preamble$ write$ newline$ }
  if$
  "\begin{thebibliography}{" longest.label * "}" *
  "\itemsep=-1pt" * % Compact the entries a little.
  write$ newline$
}

EXECUTE {begin.bib}

EXECUTE {init.state.consts}

ITERATE {call.type$}

FUNCTION {end.bib}
{ newline$
  "\end{thebibliography}" write$ newline$
}

EXECUTE {end.bib}

% end of file ieee.bst
% ---------------------------------------------------------------
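For readers unfamiliar with BibTeX style files: the sorting pass above (`sortify`, `chop.word`, `sort.format.names`, `presort`) builds each entry's sort key from the purified author names, the year, and the title with any leading article removed. The following is only a rough Python illustration of that idea, not a reimplementation of BibTeX; the function names and the example entry are ours.

```python
# Rough Python illustration (not a BibTeX reimplementation) of how presort
# above assembles a sort key: purified/lowercased names, then year, then the
# title with a leading "A ", "An ", or "The " stripped (chop.word).
import re


def sortify(s: str) -> str:
    """Approximate BibTeX purify$ + change.case$: keep letters, digits, spaces."""
    return re.sub(r"[^a-z0-9 ]", "", s.lower())


def chop_word(s: str, prefix: str) -> str:
    """Drop `prefix` from the start of `s` if present (like chop.word)."""
    return s[len(prefix):] if s.startswith(prefix) else s


def sort_key(authors: list[str], year: str, title: str) -> str:
    names = " ".join(sortify(a) for a in authors)
    for article in ("The ", "An ", "A "):
        title = chop_word(title, article)
    return f"{names} {sortify(year)} {sortify(title)}"


# Entries end up ordered by author names, then year, then title.
print(sort_key(["Connelly Barnes", "Eli Shechtman"], "2009", "A randomized correspondence algorithm"))
```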
paper-tex/ms.bbl
ADDED
@@ -0,0 +1,337 @@
\begin{thebibliography}{10}\itemsep=-1pt

\bibitem{Pixel6as49}
Pixel 6, a smarter chip for a smarter phone - google store.
\newblock \url{https://store.google.com/product/pixel_6?hl=en-US}.
\newblock (Accessed on 11/14/2021).

\bibitem{ttpla}
Rabab Abdelfattah, Xiaofeng Wang, and Song Wang.
\newblock Ttpla: An aerial-image dataset for detection and segmentation of transmission towers and power lines.
\newblock In {\em Proceedings of the Asian Conference on Computer Vision}, 2020.

\bibitem{barnes2009patchmatch}
Connelly Barnes, Eli Shechtman, Adam Finkelstein, and Dan~B Goldman.
\newblock Patchmatch: A randomized correspondence algorithm for structural image editing.
\newblock {\em ACM Trans. Graph.}, 28(3):24, 2009.

\bibitem{deeplab}
Liang-Chieh Chen, George Papandreou, Iasonas Kokkinos, Kevin Murphy, and Alan~L Yuille.
\newblock Deeplab: Semantic image segmentation with deep convolutional nets, atrous convolution, and fully connected crfs.
\newblock {\em IEEE transactions on pattern analysis and machine intelligence}, 40(4):834--848, 2017.

\bibitem{deeplabv3}
Liang-Chieh Chen, George Papandreou, Florian Schroff, and Hartwig Adam.
\newblock Rethinking atrous convolution for semantic image segmentation.
\newblock {\em arXiv preprint arXiv:1706.05587}, 2017.

\bibitem{deeplabv3p}
Liang-Chieh Chen, Yukun Zhu, George Papandreou, Florian Schroff, and Hartwig Adam.
\newblock Encoder-decoder with atrous separable convolution for semantic image segmentation.
\newblock In {\em Proceedings of the European conference on computer vision (ECCV)}, pages 801--818, 2018.

\bibitem{glnet}
Wuyang Chen, Ziyu Jiang, Zhangyang Wang, Kexin Cui, and Xiaoning Qian.
\newblock Collaborative global-local networks for memory-efficient segmentation of ultra-high resolution images.
\newblock In {\em Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition}, pages 8924--8933, 2019.

\bibitem{cascadepsp}
Ho~Kei Cheng, Jihoon Chung, Yu-Wing Tai, and Chi-Keung Tang.
\newblock Cascadepsp: toward class-agnostic and very high-resolution segmentation via global and local refinement.
\newblock In {\em Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition}, pages 8890--8899, 2020.

\bibitem{darabi2012image}
Soheil Darabi, Eli Shechtman, Connelly Barnes, Dan~B Goldman, and Pradeep Sen.
\newblock Image melding: Combining inconsistent images using patch-based synthesis.
\newblock {\em ACM Transactions on graphics (TOG)}, 31(4):1--10, 2012.

\bibitem{isdnet}
Shaohua Guo, Liang Liu, Zhenye Gan, Yabiao Wang, Wuhao Zhang, Chengjie Wang, Guannan Jiang, Wei Zhang, Ran Yi, Lizhuang Ma, et~al.
\newblock Isdnet: Integrating shallow and deep networks for efficient ultra-high resolution segmentation.
\newblock In {\em Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition}, pages 4361--4370, 2022.

\bibitem{hassani2022dilated}
Ali Hassani and Humphrey Shi.
\newblock Dilated neighborhood attention transformer.
\newblock 2022.

\bibitem{hassani2022neighborhood}
Ali Hassani, Steven Walton, Jiachen Li, Shen Li, and Humphrey Shi.
\newblock Neighborhood attention transformer.
\newblock 2022.

\bibitem{resnet}
Kaiming He, Xiangyu Zhang, Shaoqing Ren, and Jian Sun.
\newblock Deep residual learning for image recognition.
\newblock In {\em Proceedings of the IEEE conference on computer vision and pattern recognition}, pages 770--778, 2016.

\bibitem{ccnet}
Zilong Huang, Xinggang Wang, Lichao Huang, Chang Huang, Yunchao Wei, and Wenyu Liu.
\newblock Ccnet: Criss-cross attention for semantic segmentation.
\newblock In {\em Proceedings of the IEEE/CVF International Conference on Computer Vision}, pages 603--612, 2019.

\bibitem{magnet}
Chuong Huynh, Anh~Tuan Tran, Khoa Luu, and Minh Hoai.
\newblock Progressive semantic segmentation.
\newblock In {\em Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition}, pages 16755--16764, 2021.

\bibitem{globallocal}
Satoshi Iizuka, Edgar Simo-Serra, and Hiroshi Ishikawa.
\newblock Globally and locally consistent image completion.
\newblock {\em ACM Transactions on Graphics (ToG)}, 36(4):1--14, 2017.

\bibitem{jain2022oneformer}
Jitesh Jain, Jiachen Li, MangTik Chiu, Ali Hassani, Nikita Orlov, and Humphrey Shi.
\newblock {OneFormer: One Transformer to Rule Universal Image Segmentation}.
\newblock 2023.

\bibitem{jain2021semask}
Jitesh Jain, Anukriti Singh, Nikita Orlov, Zilong Huang, Jiachen Li, Steven Walton, and Humphrey Shi.
\newblock Semask: Semantically masking transformer backbones for effective semantic segmentation.
\newblock {\em arXiv}, 2021.

\bibitem{jain2022keys}
Jitesh Jain, Yuqian Zhou, Ning Yu, and Humphrey Shi.
\newblock Keys to better image inpainting: Structure and texture go hand in hand.
\newblock {\em arXiv preprint arXiv:2208.03382}, 2022.

\bibitem{swiftlane}
Oshada Jayasinghe, Damith Anhettigama, Sahan Hemachandra, Shenali Kariyawasam, Ranga Rodrigo, and Peshala Jayasekara.
\newblock Swiftlane: Towards fast and efficient lane detection.
\newblock {\em arXiv preprint arXiv:2110.11779}, 2021.

\bibitem{karras2020analyzing}
Tero Karras, Samuli Laine, Miika Aittala, Janne Hellsten, Jaakko Lehtinen, and Timo Aila.
\newblock Analyzing and improving the image quality of stylegan.
\newblock In {\em Proceedings of the IEEE/CVF conference on computer vision and pattern recognition}, pages 8110--8119, 2020.

\bibitem{kaspar2015self}
Alexandre Kaspar, Boris Neubert, Dani Lischinski, Mark Pauly, and Johannes Kopf.
\newblock Self tuning texture optimization.
\newblock In {\em Computer Graphics Forum}, volume~34, pages 349--359. Wiley Online Library, 2015.

\bibitem{cable_inst}
Bo Li, Cheng Chen, Shiwen Dong, and Junfeng Qiao.
\newblock Transmission line detection in aerial images: An instance segmentation approach based on multitask neural networks.
\newblock {\em Signal Processing: Image Communication}, 96:116278, 2021.

\bibitem{focal}
Tsung-Yi Lin, Priya Goyal, Ross Girshick, Kaiming He, and Piotr Doll{\'a}r.
\newblock Focal loss for dense object detection.
\newblock In {\em Proceedings of the IEEE international conference on computer vision}, pages 2980--2988, 2017.

\bibitem{partialconv}
Guilin Liu, Fitsum~A Reda, Kevin~J Shih, Ting-Chun Wang, Andrew Tao, and Bryan Catanzaro.
\newblock Image inpainting for irregular holes using partial convolutions.
\newblock In {\em Proceedings of the European conference on computer vision (ECCV)}, pages 85--100, 2018.

\bibitem{swin}
Ze Liu, Yutong Lin, Yue Cao, Han Hu, Yixuan Wei, Zheng Zhang, Stephen Lin, and Baining Guo.
\newblock Swin transformer: Hierarchical vision transformer using shifted windows.
\newblock {\em arXiv preprint arXiv:2103.14030}, 2021.

\bibitem{adamw}
Ilya Loshchilov and Frank Hutter.
\newblock Decoupled weight decay regularization.
\newblock {\em arXiv preprint arXiv:1711.05101}, 2017.

\bibitem{lsnet}
Van~Nhan Nguyen, Robert Jenssen, and Davide Roverso.
\newblock Ls-net: Fast single-shot line-segment detector.
\newblock {\em arXiv preprint arXiv:1912.09532}, 2019.

\bibitem{contextencoder}
Deepak Pathak, Philipp Krahenbuhl, Jeff Donahue, Trevor Darrell, and Alexei~A Efros.
\newblock Context encoders: Feature learning by inpainting.
\newblock In {\em Proceedings of the IEEE conference on computer vision and pattern recognition}, pages 2536--2544, 2016.

\bibitem{dalle}
Aditya Ramesh, Prafulla Dhariwal, Alex Nichol, Casey Chu, and Mark Chen.
\newblock Hierarchical text-conditional image generation with clip latents.
\newblock {\em arXiv preprint arXiv:2204.06125}, 2022.

\bibitem{dpt}
Ren{\'e} Ranftl, Alexey Bochkovskiy, and Vladlen Koltun.
\newblock Vision transformers for dense prediction.
\newblock In {\em Proceedings of the IEEE/CVF International Conference on Computer Vision}, pages 12179--12188, 2021.

\bibitem{rombach2022high}
Robin Rombach, Andreas Blattmann, Dominik Lorenz, Patrick Esser, and Bj{\"o}rn Ommer.
\newblock High-resolution image synthesis with latent diffusion models.
\newblock In {\em Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition}, pages 10684--10695, 2022.

\bibitem{ohem}
Abhinav Shrivastava, Abhinav Gupta, and Ross Girshick.
\newblock Training region-based object detectors with online hard example mining.
\newblock In {\em Proceedings of the IEEE conference on computer vision and pattern recognition}, pages 761--769, 2016.

\bibitem{structurelane}
Jinming Su, Chao Chen, Ke Zhang, Junfeng Luo, Xiaoming Wei, and Xiaolin Wei.
\newblock Structure guided lane detection.
\newblock {\em arXiv preprint arXiv:2105.05403}, 2021.

\bibitem{suvorov2022resolution}
Roman Suvorov, Elizaveta Logacheva, Anton Mashikhin, Anastasia Remizova, Arsenii Ashukha, Aleksei Silvestrov, Naejin Kong, Harshith Goka, Kiwoong Park, and Victor Lempitsky.
\newblock Resolution-robust large mask inpainting with fourier convolutions.
\newblock In {\em Proceedings of the IEEE/CVF Winter Conference on Applications of Computer Vision}, pages 2149--2159, 2022.

\bibitem{lanedet}
Lucas Tabelini, Rodrigo Berriel, Thiago~M Paixao, Claudine Badue, Alberto~F De~Souza, and Thiago Oliveira-Santos.
\newblock Keep your eyes on the lane: Real-time attention-guided lane detection.
\newblock In {\em Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition}, pages 294--302, 2021.

\bibitem{attention}
Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan~N Gomez, {\L}ukasz Kaiser, and Illia Polosukhin.
\newblock Attention is all you need.
\newblock In {\em Advances in neural information processing systems}, pages 5998--6008, 2017.

\bibitem{wexler2007space}
Yonatan Wexler, Eli Shechtman, and Michal Irani.
\newblock Space-time completion of video.
\newblock {\em IEEE Transactions on pattern analysis and machine intelligence}, 29(3):463--476, 2007.

\bibitem{segformer}
Enze Xie, Wenhai Wang, Zhiding Yu, Anima Anandkumar, Jose~M Alvarez, and Ping Luo.
\newblock Segformer: Simple and efficient design for semantic segmentation with transformers.
\newblock {\em arXiv preprint arXiv:2105.15203}, 2021.

\bibitem{xu2022image}
Xingqian Xu, Shant Navasardyan, Vahram Tadevosyan, Andranik Sargsyan, Yadong Mu, and Humphrey Shi.
\newblock Image completion with heterogeneously filtered spectral hints.
\newblock In {\em WACV}, 2023.

\bibitem{powerlinedataset}
{\"O}mer~Emre Yetgin, {\"O}mer~Nezih Gerek, and {\"O}mer Nezih.
\newblock Power image dataset (infrared-ir and visible light-vl).
\newblock {\em Mendeley Data}, 8, 2017.

\bibitem{hifill}
Zili Yi, Qiang Tang, Shekoofeh Azizi, Daesik Jang, and Zhan Xu.
\newblock Contextual residual aggregation for ultra high-resolution image inpainting.
\newblock In {\em Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition}, pages 7508--7517, 2020.

\bibitem{contextual}
Jiahui Yu, Zhe Lin, Jimei Yang, Xiaohui Shen, Xin Lu, and Thomas~S Huang.
\newblock Generative image inpainting with contextual attention.
\newblock In {\em Proceedings of the IEEE conference on computer vision and pattern recognition}, pages 5505--5514, 2018.

\bibitem{yu2019free}
Jiahui Yu, Zhe Lin, Jimei Yang, Xiaohui Shen, Xin Lu, and Thomas~S Huang.
\newblock Free-form image inpainting with gated convolution.
\newblock In {\em Proceedings of the IEEE/CVF international conference on computer vision}, pages 4471--4480, 2019.

\bibitem{zeng2020high}
Yu Zeng, Zhe Lin, Jimei Yang, Jianming Zhang, Eli Shechtman, and Huchuan Lu.
\newblock High-resolution image inpainting with iterative confidence feedback and guided upsampling.
\newblock In {\em European conference on computer vision}, pages 1--17. Springer, 2020.

\bibitem{pldu}
Heng Zhang, Wen Yang, Huai Yu, Haijian Zhang, and Gui-Song Xia.
\newblock Detecting power lines in uav images with convolutional features and structured constraints.
\newblock {\em Remote Sensing}, 11(11):1342, 2019.

\bibitem{supercaf}
Lingzhi Zhang, Connelly Barnes, Kevin Wampler, Sohrab Amirghodsi, Eli Shechtman, Zhe Lin, and Jianbo Shi.
\newblock Inpainting at modern camera resolution by guided patchmatch with auto-curation.
\newblock In {\em European Conference on Computer Vision}, pages 51--67. Springer, 2022.

\bibitem{pspnet}
Hengshuang Zhao, Jianping Shi, Xiaojuan Qi, Xiaogang Wang, and Jiaya Jia.
\newblock Pyramid scene parsing network.
\newblock In {\em Proceedings of the IEEE conference on computer vision and pattern recognition}, pages 2881--2890, 2017.

\bibitem{comodgan}
Shengyu Zhao, Jonathan Cui, Yilun Sheng, Yue Dong, Xiao Liang, Eric~I Chang, and Yan Xu.
\newblock Large scale image completion via co-modulated generative adversarial networks.
\newblock {\em arXiv preprint arXiv:2103.10428}, 2021.

\bibitem{zheng2022cm}
Haitian Zheng, Zhe Lin, Jingwan Lu, Scott Cohen, Eli Shechtman, Connelly Barnes, Jianming Zhang, Ning Xu, Sohrab Amirghodsi, and Jiebo Luo.
\newblock Cm-gan: Image inpainting with cascaded modulation gan and object-aware training.
\newblock {\em arXiv preprint arXiv:2203.11947}, 2022.

\bibitem{setr}
Sixiao Zheng, Jiachen Lu, Hengshuang Zhao, Xiatian Zhu, Zekun Luo, Yabiao Wang, Yanwei Fu, Jianfeng Feng, Tao Xiang, Philip~HS Torr, et~al.
\newblock Rethinking semantic segmentation from a sequence-to-sequence perspective with transformers.
\newblock In {\em Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition}, pages 6881--6890, 2021.

\bibitem{zhou2017places}
Bolei Zhou, Agata Lapedriza, Aditya Khosla, Aude Oliva, and Antonio Torralba.
\newblock Places: A 10 million image database for scene recognition.
\newblock {\em IEEE Transactions on Pattern Analysis and Machine Intelligence}, 2017.

\end{thebibliography}
paper-tex/ms.tex
ADDED
@@ -0,0 +1,100 @@
% CVPR 2023 Paper Template
% based on the CVPR template provided by Ming-Ming Cheng (https://github.com/MCG-NKU/CVPR_Template)
% modified and extended by Stefan Roth ([email protected])

\documentclass[10pt,twocolumn,letterpaper]{article}

%%%%%%%%% PAPER TYPE - PLEASE UPDATE FOR FINAL VERSION
% \usepackage[review]{cvpr}      % To produce the REVIEW version
\usepackage{cvpr}                % To produce the CAMERA-READY version
%\usepackage[pagenumbers]{cvpr}  % To force page numbers, e.g. for an arXiv version

\usepackage[accsupp]{axessibility}  % Improves PDF readability for those with disabilities.

% Include other packages here, before hyperref.
\usepackage[normalem]{ulem}
\usepackage{graphicx}
\usepackage{amsmath}
\usepackage{amssymb}
\usepackage{booktabs}
\usepackage{xcolor}
\usepackage{comment}
\usepackage{enumitem}
\newcommand{\todo}[1]{{\color{red}#1}}

\newcommand{\benchmark}{WireSegHR}

% It is strongly recommended to use hyperref, especially for the review version.
% hyperref with option pagebackref eases the reviewers' job.
% Please disable hyperref *only* if you encounter grave issues, e.g. with the
% file validation for the camera-ready version.
%
% If you comment hyperref and then uncomment it, you should delete
% ReviewTemplate.aux before re-running LaTeX.
% (Or just hit 'q' on the first LaTeX run, let it finish, and you
%  should be clear).
\usepackage[pagebackref,breaklinks,colorlinks]{hyperref}

% Support for easy cross-referencing
\usepackage[capitalize]{cleveref}
\crefname{section}{Sec.}{Secs.}
\Crefname{section}{Section}{Sections}
\Crefname{table}{Table}{Tables}
\crefname{table}{Tab.}{Tabs.}

%%%%%%%%% PAPER ID - PLEASE UPDATE
\def\cvprPaperID{699} % *** Enter the CVPR Paper ID here
\def\confName{CVPR}
\def\confYear{2023}

\begin{document}

%%%%%%%%% TITLE - PLEASE UPDATE
% \title{Automatic Wire Segmentation for Inpainting}
% \title{Segmentation to the Extreme: \\A Large-Scale Wire Segmentation Dataset and a Pilot Study}
%\title{Segmentation to the Extreme: \\A Large-Scale High-Resolution Wire Segmentation Dataset and a Pilot Study}
%\title{WINE: WIre NEver Appeared in Your Photos}
\title{Automatic High Resolution Wire Segmentation and Removal}

\author{Mang Tik Chiu$^{1,2}$, Xuaner Zhang$^2$, Zijun Wei$^{2}$, Yuqian Zhou$^2$, Eli Shechtman$^2$, \\Connelly Barnes$^{2}$, Zhe Lin$^2$, Florian Kainz$^2$, Sohrab Amirghodsi$^2$, Humphrey Shi$^{1,3}$
\\
{\small $^1$UIUC, $^2$Adobe, $^3$University of Oregon}\\
{\small \textbf{\url{https://github.com/adobe-research/auto-wire-removal}}}}

\twocolumn[{%
\renewcommand\twocolumn[1][]{#1}%
\maketitle
\begin{center}
\centering
\captionsetup{type=figure}
\includegraphics[width=\textwidth]{figures/teaser2.pdf}
\captionof{figure}{We present an automatic high-resolution wire segmentation and removal pipeline. Each triad shows the high-resolution input image, our automatic wire segmentation result masked in red, and our full-resolution wire removal result. The visual quality of these photographs is greatly improved with our fully-automated wire clean-up system.}
\label{fig:demo}
\end{center}%
}]

% \vspace{3mm}
%[INTERNAL]: \href{https://session-0f3f6c51-53bd-446a-ad30-ea59ed8b6445.devbox.training.adobesensei.io/tensorboard/result.html}{Visualizations} of current models.

%%%%%%%%% ABSTRACT
% \input{figure_tex/teaser}
\input{sections/abstract}

%%%%%%%%% BODY TEXT
\input{sections/introduction}
\input{sections/related_work}
\input{sections/dataset}
\input{sections/method_yq}
\input{sections/results}
\input{sections/discussion}
\input{sections/conclusion}

%%%%%%%%% REFERENCES
{\small
\bibliographystyle{ieee_fullname}
\bibliography{egbib}
}

\end{document}
paper-tex/sections/abstract.tex
ADDED
@@ -0,0 +1,23 @@
%auto-ignore
\begin{abstract}
\vspace{-5.5mm}
Wires and powerlines are common visual distractions that often undermine the aesthetics of photographs. The manual process of precisely segmenting and removing them is extremely tedious and can take hours, especially on high-resolution photos where wires may span the entire frame. In this paper, we present an automatic wire clean-up system that reduces wire segmentation and removal/inpainting to a few seconds of processing.
We observe several unique challenges: wires are thin, lengthy, and sparse. These properties are rare among the subjects of common segmentation tasks, especially in high-resolution images.
We thus propose a two-stage method that leverages both global and local contexts to accurately and efficiently segment wires in high-resolution images, and a tile-based inpainting strategy to remove the wires given our predicted segmentation masks. We also introduce the first wire segmentation benchmark dataset, \benchmark. Finally, we demonstrate quantitatively and qualitatively that our wire clean-up system enables fully automated wire removal and generalizes well to diverse wire appearances.

%We introduce the novel problem of semantic segmentation for wire-like objects in high-resolution images for photo retouching applications.
%We observe several challenging properties of wire appearances that are rare in most segmentation datasets -- thin, lengthy, and sparse, which recent works in high-resolution semantic segmentation cannot effectively solve. We thus propose a two-stage model that leverages global context and local information to predict accurate wire masks, and design an inference pipeline that efficiently handles high-resolution images.
%To encourage future research, we introduce WireSeg-HR, the first benchmark dataset on wire-like object semantic segmentation for photographic applications. Finally, we show our wire segmentation pipeline enables fully automated wire removal for photo retouching.


% , and show that our model outperforms baseline methods.
% In the end, we demonstrate that with our predicted wire masks, wire-like object removal, a common while tedious photo retouching step, can be made fully automated with high quality results.

% We introduce the novel problem of semantic segmentation for wire-like objects in high-resolution images for photo retouching applications.
% We observe several challenging properties of wire appearances that are rare in most segmentation datasets -- thin, extensive, sparse and discontinuous (due to frequent occlusions). We provide comprehensive analyses on the effect of common semantic segmentation approaches over these wire-like objects and propose a two-stage model to overcome these difficulties.
% To encourage research on tackling challenging properties of wires, we introduce WireSeg-HR, the first benchmark dataset on wire-like object semantic segmentation for photographic applications, and show that our model outperforms baseline methods.
% % In the end, we demonstrate that with our predicted wire masks, wire-like object removal, a common while tedious photo retouching step, can be made fully automated with high quality results.


\end{abstract}
% \vspace{-1.343cm}
paper-tex/sections/conclusion.tex
ADDED
@@ -0,0 +1,8 @@
%auto-ignore
\section{Conclusion}

In this paper, we propose a fully automated wire segmentation and removal system for high-resolution imagery. We demonstrate a segmentation method that maximally preserves sparse wire features and annotations, with a two-stage model that effectively uses global context and local details. The predicted segmentation mask is used in our tile-based wire inpainting model, which we demonstrate produces seamless inpainting results.
We also introduce \benchmark, the first benchmark wire dataset with high-quality annotations. We hope our proposed method provides insights into semantic segmentation under the unique properties of high-resolution images and annotations, and that our benchmark dataset encourages further research in wire segmentation and removal.

%\todo{ending}
% Our wire segmentation and removal system method is a key component in automatic wire removal, and encourages research in recognizing associated wire entities such as its shadows and reflections. Our work will also motivate research in efficient inpainting algorithms with a focus on challenging appearances of wires.
paper-tex/sections/dataset.tex
ADDED
@@ -0,0 +1,32 @@
%auto-ignore

\section{Dataset Collection and \benchmark} \label{sec:dataset}
% We present
%We will release the test set (407 images?) to provide a standardized platform for benchmarking. \yq{Should we mention here we only release the testing dataset, or just do not mention now whether we want to release the training one.}
%In this section, we describe the unique properties and provide statistics of WireSeg-HR, a high-resolution photographic image benchmark dataset for wire semantic segmentation.

\subsection{Image Source and Annotations}

% We collected high-resolution images with wires and wire-like objects from multiple sources. 20\% of the images are taken with DSLRs and smartphone cameras, while the others are sourced from the Internet (e.g. Flickr). As a result, the images have a variety of resolutions and have gone through different image processing pipelines. We target images with scenes that include city, street, rural area and landscape. We restricted the number of images in which the wire is the major object. We manually filter images so that wires in these images appear differently, for example, in front of buildings, partially occluded by vegetation or be hardly visible in the sky. This leads to a rich set of wire appearances. The final dataset after manual selection contains 3201 images.
Our definition of wires includes electrical wires/cables, power lines, supporting/connecting wires, and any wire-like object that resembles a wire structure.
We collect high-resolution images with wires from two sources: 80\% of the images are from photo sharing platforms (Flickr, Pixabay, etc.), and 20\% are captured ourselves with different cameras (DSLRs and smartphones) in multiple countries. For the internet images, we collect 400K candidate images by keyword search. We then remove duplicates and images where wires are the only subjects, and curate a final set of 6K images covering diverse scenes such as cities, streets, rural areas, and landscapes.

%\subsection{Wire annotation}

\input{figure_tex/annotation}
\input{figure_tex/pipeline}
Our wire annotation process consists of two rounds. In the first round, annotators draw detailed masks over wires at full resolution. The annotated masks enclose the main wire body and the boundary, oftentimes including a gradient falloff due to aliasing or defocus.
The boundary region annotation is crucial for avoiding residual artifacts during wire removal.
In the second round, quality assurance is carried out to re-annotate unsatisfactory annotations. We show an example of our high-quality wire annotations in Figure~\ref{fig:annotation}.
% As shown, our dataset contains highly detailed masks for wires of various shapes and appearances.

\subsection{Dataset Statistics}
%\yq{Do we want to release the entire dataset. It's better to have a table comparing the statistics of different exisiting datasets and show the special cases we covered but others do not. }\cezhang{agreed, we should also mention and justify that this is the first wire dataset, unlike previous datasets such as the one for line detection or for satellite images, or any other available ones.}

In Table~\ref{table:stats}, we list the statistics of our dataset and compare them with existing wire-like datasets. Our dataset is the first wire dataset that contains high-resolution photographic images. The dataset is randomly split into 5000 training, 500 validation, and 500 testing images. We release 420 copyright-free test images with annotations.
%We will release the 500 test images (the \benchmark) with annotations upon paper acceptance.

%We will release \todo{how many?} test set as the first wire segmentation benchmark dataset.

\input{tables/stats}

paper-tex/sections/discussion.tex
ADDED
@@ -0,0 +1,43 @@
%auto-ignore

\section{Discussion}

% \subsection{Wire completion}

% In some cases, a part of a wire goes in front of a textured object and becomes difficult to recognize. When this happens, a light post-processing step can be performed to connect two wire segments and form a complete wire. We achieve this by fitting a parabola on each predicted wire segment. Pixels along the fitted curve that are originally predicted as background with a wire confidence of above 0.4 are reclassified as wire. We show in Figure~\ref{fig:completion} how this simple post-processing method can help picking up any missed wires due to blending with backgrounds.

\begin{comment}
\subsection{Wire removal}
%\input{figure_tex/removal}

Figure~\ref{fig:pixel6} (B) shows two examples of automatic wire removal using wire segmentation masks from our proposed model. We use the ``Content-aware Fill''~\cite{barnes2009patchmatch} feature in Photoshop to remove wires. The full process of wire segmentation and removal is fully automatic without the need of any user interaction.
\end{comment}

% \subsection{Generalizing to large images}

% \input{figure_tex/panorama}

% Given the fact that wires are sparse in natural images, we show that our two-stage model generalizes to ultra-high resolution images such as panoramas without significant computation overhead. A $10$K by $2$K panorama with its segmentation result is shown in Figure~\ref{fig:panorama}. The full inference takes $6.7$ seconds and produces high-quality wire segmentation.
\vspace{-1.5mm}
\subsection{Comparison with Google Pixel 6}
\vspace{-1.5mm}
% \input{figure_tex/pixel6}
Google Pixel 6~\cite{Pixel6as49} recently introduced the ``Magic Eraser'' photo feature, which automatically detects and removes distractors. Note that this is a product feature that is not specifically designed for wires, and thus is not directly comparable with our method. We compare against this feature by uploading the images to Google Photos and applying ``Magic Eraser'' without manual intervention. We find that ``Magic Eraser'' performs well on wires over clean backgrounds, but it struggles with thin wires that are barely visible and with wires over complicated backgrounds. We show two examples in the supplementary material.

\subsection{Failure cases}
\vspace*{-2.5mm}
% \input{figure_tex/failure}
%\cezhang{todo: fill in caption for figure and revise this paragraph}
%Figure~\ref{fig:failure} shows examples where our model does not perform well. In the upper row example, the diagonal and vertical wires are ambiguously blended with the background texture, which make the wires hardly distinguishable. In the lower row case, thicker wires with complex structures rarely appear in the dataset, causing predictions of only part of the wire.

While our proposed wire segmentation model produces high-quality masks in most situations, there are still challenging cases that it cannot resolve. In particular, wires that are heavily blended with surrounding structures or background, or wires under extreme lighting conditions, are difficult to segment accurately. We show several examples in the supplementary material.

\input{tables/component}
\input{tables/thresholds}
\input{tables/inpaint}
\input{figure_tex/inpaint_compare}

% \subsection{Potential negative impacts}

% While our work is primarily used in image retouching applications, improper use of our wire segmentation model and content removal may spread misinformation for malicious intents. Content authentication could be used to mark the integrity of the shared content. Caution must be taken on the potential negative social impacts that come with the application of this work.
% \vspace{-2mm}
paper-tex/sections/introduction.tex
ADDED
@@ -0,0 +1,86 @@
| 1 |
+
%auto-ignore
|
| 2 |
+
|
| 3 |
+
\section{Introduction}
|
| 4 |
+
|
| 5 |
+
% Distractor removal is a common step in photo editing. Conventionally, to remove any unwanted objects from an image, the user first draws a mask over the object, then uses a hole-filling algorithm to remove them and fill in the background. With recent advance in semantic segmentation, automatic masking of some common objects is possible. Among the types of distractors, wires are one of the most common yet unique objects to remove in photographs. Their ubiquity and unique attributes calls for extra attention and importance to perform automatic wire segmentation.
|
| 6 |
+
|
| 7 |
+
Oftentimes wire-like objects such as powerlines and cables can cross the full width of an image and ruin an otherwise beautiful composition. Removing these ``distractors'' is thus an essential step in photo retouching to improve the visual quality of a photograph. Conventionally, removing a wire-like object requires two steps: 1) segmenting out the wire-like object, and 2) removing the selected wire and inpainting with plausible contents. Both steps, if done manually, are extremely tedious and error-prone, especially for high-resolution photographs that may take photographers up to hours to reach a high-quality retouching result.
|
| 8 |
+
%The second step has benefited from existing image inpainting techniques (e.g. Content-aware Fill~\cite{barnes2009patchmatch} in Photoshop) and can be accomplished in an automatic way. The first step -- wire mask creation -- is much more manually intensive and error-prone, and automatic methods have yet to be fully explored.
|
| 9 |
+
|
| 10 |
+
% In this paper, we address the problem of wire masking as a way to automate the step of identifying and segmenting out those objects.
|
| 11 |
+
In this paper, we explore a fully-automated wire segmentation and inpainting solution for wire-like object segmentation and removal with tailored model architecture and data processing.
|
| 12 |
+
For simplicity, we use \textit{wire} to refer to all wire-like objects, including powerlines, cables, supporting/connecting wires, and objects with wire-like shapes.
|
| 13 |
+
|
| 14 |
+
Wire semantic segmentation has a seemingly similar problem setup with generic semantic segmentation tasks; they both take in a high-resolution image and generate dense predictions at a pixel level.
|
| 15 |
+
However, wire semantic segmentation bears a number of unique challenges. First, wires are commonly long and thin, oftentimes spanning the entire image yet having a diameter of only a handful of pixels. A few examples are shown in Figure~\ref{fig:motivation}. This prevents us from getting a precise mask based on regions of interest. Second, the input images can have arbitrarily high resolution up to 10k$\times$10k pixels for photographic retouching applications. Downsampling such high-resolution images can easily cause the thin wire structures to disappear. This poses a trade-off between preserving image size for inference quality and run-time efficiency.
|
| 16 |
+
Third, while wires have regular parabolic shapes, they are often partially occluded and can reappear at arbitrary image location, thus not continuous.
|
| 17 |
+
%This limits the use of parametric modeling
|
| 18 |
+
(e.g.~\cite{lanedet,swiftlane}).
|
| 19 |
+
%Segmentation predictions on downsampled images capture fewer details, whereas inference on high-resolution images requires quadratically more GPU memory that may be too big to fit into a modern GPU.
|
| 20 |
+
% We also note that recent semantic segmentation methods that specifically target high-resolution images also have their limitations when applied to wire images.
|
| 21 |
+
% We summarize the challenges of wires in Figure~\ref{fig:motivation}.
|
| 22 |
+
|
| 23 |
+
To account for these challenges, we propose a system for automatic wire semantic segmentation and removal. For segmentation, we design a two-stage coarse-to-fine model that leverages both pixel-level details in local patches and global semantics from the full image content, and runs efficiently at inference time. For inpainting, we adopt an efficient network architecture~\cite{suvorov2022resolution}, which enables us to use a tile-based approach to handle arbitrary high resolution. We design a training strategy to enforce color consistency between the inpainted region and the original image.
|
| 24 |
+
We also present the first benchmark dataset, \benchmark, for wire semantic segmentation tasks, where we collect and annotate high-resolution images with diverse scene contents and wire appearances.
|
| 25 |
+
We provide analyses and baseline comparisons to justify our design choices, which include data collection, augmentation, and our two-stage model design. Together, these design choices help us overcome the unique challenges of accurately segmenting wires.
|
| 26 |
+
% We report our model performance and efficiency on this benchmark dataset, and provide a set of comparisons with popular semantic segmentation methods on this task.
|
| 27 |
+
Our contributions are as follows:
|
| 28 |
+
% zwei{reordered the items}
|
| 29 |
+
|
| 30 |
+
\begin{itemize}[noitemsep]
|
| 31 |
+
%\item We introduce a novel task: wire semantic segmentation for high-resolution photographic images, an under-explored task that is extremely important for applications like photo retouching.
|
| 32 |
+
%\item We introduce a wire semantic segmentation benchmark dataset that consists of photographic images at high resolution, with large scene diversity, various types of cameras, a rich collection of wire shapes and high-definite manual annotations.
|
| 33 |
+
\item \textbf{Wire segmentation model:} We propose a two-stage model for wire semantic segmentation that leverages global context and local information to predict accurate wire masks at high resolution. We design an inference pipeline that can efficiently handle ultra-high resolution images.
|
| 34 |
+
\item \textbf{Wire inpainting strategy:} We design a tile-based inpainting strategy and tailor the inpainting method for our wire removal task given our segmentation results.
|
| 35 |
+
\item \textbf{\benchmark, a benchmark dataset:} We collect a wire segmentation benchmark dataset that consists of high resolution images, with diversity in wire shapes and scene contents. We also release the manual annotations that have been carefully curated to serve as ground truths. Besides, we also propose a benchmark dataset to evaluate inpainting quality.
|
| 36 |
+
%\yq{Explain the most important difference between ours and previous existing datasets like diversity of camera viewpoints or wire width etc. (mt:changed)}
|
| 37 |
+
%\yq{What is the advantage of the newly proposed model? Will it be helpful for processing ultra-high resolution images? Is the global information used efficiently? (mt:changed)}
|
| 38 |
+
%\item We describe a set of domain-specific data augmentation and inference pipelines specifically for wire-like objects, which help overcome unique challenges in this task and improve model performance.\yq{Do we have the ablation study to show the data augmentation improves the performance? If we want to claim it as contribution, it should have some benefits and novelty in the method itself. (mt: removed)}
|
| 39 |
+
\end{itemize}
|
| 40 |
+
|
| 41 |
+
%Distractor removal is a common component in photographic enhancement. Among the various distractors, wire is one of the most common and general types. Automatically detecting and segmenting wires in an image will greatly reduce the manual masking work. However,
|
| 42 |
+
% Conventionally, the user manually draws a mask over unwanted objects, then uses hole-filling algorithms to remove them. Recent advances in deep learning introduced object segmentation methods that automatically draw masks over objects of interest, thereby automating the masking process. Nevertheless,
|
| 43 |
+
% there are several unique challenges in training wire segmentation models: (a) the unique shape properties of wire (thin) makes it need special treatment such as resizing and sampling. This renders the use of common object segmentation methods in this task difficult, and require task-specific designs in model and training schemes. (b) it is hard to collect data (this is the reason why we have the synthetic data. (c) there is a lack of standard dataset to benchmark the progress.
|
| 44 |
+
|
| 45 |
+
%\textcolor{blue}{flow: distractor removal (common operation) -> manual masking -> automating masking (segmentation), among all distractors, wire unique and common and important and general -> automatkcill segment wire is important}
|
| 46 |
+
|
| 47 |
+
|
| 48 |
+
%\textcolor{blue}{wire seg bears similar to semantic segmentation in that (a, sinput, b..), but due to the attributes (thin, large span,,,) of wire, normal segmentation is suboptimal. to ward this end, we developped xxx that addresses these issues our models is a two stage corease file, maintains efficenty}.
|
| 49 |
+
|
| 50 |
+
% \textcolor{blue}{Moreover, we provide a systemactic frameworks from data data processing ,data ayucmetion to post-process}.
|
| 51 |
+
|
| 52 |
+
|
| 53 |
+
% \textcolor{blue}{realtive new field, wihtout too much data, previous benches marks (low...), we contrbute to this reserach field wtih a high quality benchmark, we also profiled a couple of metrics and discuss the performance }
|
| 54 |
+
|
| 55 |
+
%\textcolor{blue}{to sum up, our xxx 1, 2. 3}
|
| 56 |
+
|
| 57 |
+
|
| 58 |
+
|
| 59 |
+
|
| 60 |
+
% flow:
|
| 61 |
+
|
| 62 |
+
%\textcolor{red}{REARANGE SECTIONS TO ONE SMOOTH STORY}
|
| 63 |
+
|
| 64 |
+
%\subsection*{Conventional semantic segmentation}
|
| 65 |
+
%In the conventional setting of semantic segmentation, the model learns to segment entities such as person, table and car. A common trait of these objects of interest is that they are somewhat regularly shaped, which means the object's width and height are similar. This property is very advantageous in terms of model training, since the object features will remain equally prominent when the image is resized without distortion. As a result, it is a common approach to downsample a large image to fit within a GPU for training and inference without losing object details.
|
| 66 |
+
|
| 67 |
+
%\subsection*{Segmenting wires in outdoor images}
|
| 68 |
+
%Outdoor wires and cables exhibit significantly different shape and lighting properties than common objects. First, many wires in outdoor images are extremely long and thin. This means that wires can often span across the entire image, while being only a few pixels thick. This poses several problems to performing inference on images, since too much downsampling would cause the wire to disappear, while too little downsampling would impose significant computation overhead. Motivated by these challenges, we propose a pipeline for wire segmentation in high-resolution images for image enhancement and provide a set of baseline in this task. Our contributions are as follows:
|
| 69 |
+
|
| 70 |
+
%In this paper we systematically tackle the problem of wire segmentation and address issues including data collection, image pre-processing and data augmentation during training.
|
| 71 |
+
|
| 72 |
+
% \section{Introduction}
|
| 73 |
+
|
| 74 |
+
% Distractor removal is a common component in photographic enhancement. Conventionally, the user manually draws a mask over unwanted objects, then uses hole-filling algorithms to remove them. Recent advances in deep learning introduced object segmentation methods that automatically draw masks over objects of interest, thereby automating the masking process. Nevertheless, there are several unique challenges in wire segmentation that renders the use of common object segmentation methods in this task difficult, and require task-specific designs in model and training schemes.
|
| 75 |
+
|
| 76 |
+
% \subsection*{Conventional semantic segmentation}
|
| 77 |
+
% In the conventional setting of semantic segmentation, the model learns to segment entities such as person, table and car. A common trait of these objects of interest is that they are somewhat regularly shaped, which means the object's width and height are similar. This property is very advantageous in terms of model training, since the object features will remain equally prominent when the image is resized without distortion. As a result, it is a common approach to downsample a large image to fit within a GPU for training and inference without losing object details.
|
| 78 |
+
|
| 79 |
+
% \subsection*{Segmenting wires in outdoor images}
|
| 80 |
+
% Outdoor wires and cables exhibit significantly different shape and lighting properties from common objects. First, many wires in outdoor images are extremely long and thin. This means that wires can often span across the entire image, while being only a few pixels thick. This poses several problems for performing inference on images, since too much downsampling would cause the wire to disappear, while too little downsampling would impose significant computation overhead. Motivated by these challenges, we propose a pipeline for wire segmentation in high-resolution images for image enhancement and provide a set of baselines for this task. Our contributions are as follows:
|
| 81 |
+
|
| 82 |
+
% \begin{itemize}
|
| 83 |
+
% \item We introduce a wire semantic segmentation dataset consisting of high-resolution images with wires.
|
| 84 |
+
% \item We propose a two-stage model for wire segmentation that captures high-resolution image features without significant computation overhead.
|
| 85 |
+
% \item We describe a set of pre-processing and post-processing pipelines specific to wires, which help overcome unique challenges in this task.
|
| 86 |
+
% \end{itemize}
|
paper-tex/sections/method.tex
ADDED
|
@@ -0,0 +1,98 @@
|
| 1 |
+
%auto-ignore
|
| 2 |
+
|
| 3 |
+
% \textcolor{blue}{ 2 options: 1. synthetic works: sec 3. data, sec 4. method, if synthetic not working: sec, method, sec, evaluation data collection: 1. coverage (indoor, outdoor, we even handshot...: multiple sources [flicker, google, etc, our own shooting]),
|
| 4 |
+
% multiple coverage (different keywords, select 400 from xxxK images, [ask mada to provide distribution, in supplementary]), 3. resolution: minimize. [How to convince people to use the test set..], annotation quality [in house annotation, multiple rounds of quality assurance]. We xx, 409 will be released}
|
| 5 |
+
|
| 6 |
+
% \textcolor{blue}{ if synthetic works, add another subsection in data section}
|
| 7 |
+
|
| 8 |
+
% \subsection{Generating synthetic dataset}
|
| 9 |
+
% While our dataset contains xxxx high-resolution images, the number is still insufficient when compared to other large segmentation datasets such as the well known Cityscapes~\cite{cityscapes}, Mapillary Vistas~\cite{mapillary} and MSCOCO dataset~\cite{coco}. To overcome the data insufficiency issue, one approach is to incorporate synthetic datasets with similar properties in the training. This is used in numerous vision related tasks such as 3D reconstruction~\cite{3drecon}, semantic segmentation (specifically domain adaptation)~\cite{domainadapt}. Synthetic dataset is also widely used in line and wire segmentation tasks~\cite{syntheticline, syntheticwire}, though they primarily used 3D rendering engines to generate artificial wires.
|
| 10 |
+
|
| 11 |
+
% Contrary to their approaches, we opt to generate our synthetic dataset with existing wires by blending wires from one image to another. Our approach is motivated by the fact that wires in our dataset have very different shapes and sizes, and are therefore difficult to generate artificially. To ensure our synthetic dataset is adequately realistic, we leverage semantic information of the scene as a cue to select regions for wire blending. For example, we only blend wires from the sky of one image to the sky of another image, such that the lighting, texture and nearby entities are similar.
|
| 12 |
+
|
| 13 |
+
% Here we describe in detail how we synthesize wire images. First, we predict a semantic map for each image in our real dataset. This gives us semantic information on the image and the wires. Then, we extract the wires from these images by cropping wire pixels and removing them from the background with Content-Aware Fill. For each obtained background image, we generate three separate images by randomly transforming (e.g. rotating and scaling) and blending several wires with the image, where the blending region is constricted by the type of object in the region and the selected wire. The resulting synthetic wire dataset contains xxxx images with different realistic wire patterns. Figure~\ref{fig:synthetic} shows an example of our generated high-resolution wire image.
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
\section{Method} \label{sec:method}
|
| 17 |
+
% Sec 4.1 Model structure
|
| 18 |
+
% Sec 4.2 how we combine: inference: end to end, information flow, how from an image to wire segmentation masks, inference method: 1. global run -> subset regions (detailed selection will be described in 4.4.1) [describe the whole process -> (new para) explain how selected.]
|
| 19 |
+
% Sec 4.3 how we train: how we train to make sure to share weights: [describe whole training, then detail each component ((new para) pre-processing, batch training, sampling, etc)]
|
| 20 |
+
% Sec.4.4 pre-processing and post-processing
|
| 21 |
+
|
| 22 |
+
% In this section we[], the structure is in fig X
|
| 23 |
+
% 4.1: subsubsection{global branch}: input, output, structure; subsubsection{local branch}: input, output, structure
|
| 24 |
+
% 4.2: combining branches, connecting, how its done to end-to-end
|
| 25 |
+
|
| 26 |
+
% \textcolor{blue}{our model is composed of xxx parts: a global (Sec xxx), a local. Add a figure}
|
| 27 |
+
|
| 28 |
+
\input{figure_tex/pipeline}
|
| 29 |
+
|
| 30 |
+
In this section, we describe our two-stage model architecture. Figure~\ref{fig:pipeline} shows its overall structure. We first discuss the motivations behind our two-stage design, then describe the architecture and training scheme in detail.
|
| 31 |
+
|
| 32 |
+
\subsection{Motivation}
|
| 33 |
+
|
| 34 |
+
As shown in Figure~\ref{fig:motivation}, we divide our model into two major components -- a coarse module and a fine module. Our two-stage design is motivated by two observations. First, wires in our dataset are extremely long and thin: many wires in our images span several thousand pixels in length while being only a few pixels across. Limited by GPU memory, we cannot simply pass the entire full-resolution image to a model for inference. As a result, two separate modules are required, where the coarse module captures the entire wire and its surrounding context at a lower resolution, and the fine module captures detailed wire textures at the original resolution.
|
| 35 |
+
|
| 36 |
+
Next, we observe that wire pixels are very sparse: a typical high-resolution image contains only a small percentage of wire pixels. We can therefore use the coarse module to guide the fine module toward the regions where full-resolution details need to be captured. This way, the fine module predicts segmentation masks only where wires are likely present, which reduces computation time.
|
| 37 |
+
|
| 38 |
+
\subsection{The two-stage coarse-to-fine model}
|
| 39 |
+
|
| 40 |
+
Given a high-resolution image $I$, the coarse module of our two-stage model aims to recognize the semantics of the entire image and pick up wire regions at a coarse level. Looking at the entire image allows the coarse module to capture more contextual information. To fit large images into GPU memory for training and inference, images are first bilinearly downsampled and then fed into the coarse module to predict a global logit map $Z_{glo}$. We use $Z_{glo}$ as a conditional input to guide the fine module in the next stage.
|
| 41 |
+
|
| 42 |
+
The fine module is conditioned on the output from the coarse module. It takes in a local image patch $I_{loc}$ of the whole image at full resolution, the global logit map $Z_{glo}$ from the coarse module, and a binary location mask $M$ that is set to $1$ inside the patch location within the full image and $0$ elsewhere. The fine module then predicts the local wire logit map $Z_{loc}$. Empirically, we find that concatenating the entire global logit map rather than cropping the logit map at the location of the image patch yields slightly improved results.
|
| 43 |
+
|
| 44 |
+
The designs of the coarse and fine modules are conceptually the same as those in GLNet~\cite{glnet} and MagNet~\cite{magnet}, where a global network is trained on entire downsampled images and a local network is trained on higher-resolution image patches conditioned on some global features. However, unlike GLNet, where intermediate features are shared between the global and local branches bidirectionally, we opt for a simpler late fusion by concatenating the logit map directly to the fine module. We also use only two stages instead of up to four stages as done in MagNet, since a single fine module at the highest resolution is adequate for refining annotations that are only several pixels thick, and additional intermediate stages can drastically increase inference time.
|
| 45 |
+
|
| 46 |
+
Our model can be trained end-to-end. To do so, in the training stage, we first apply random scaling, rotation, horizontal flipping and photometric distortion to the image. The global image $I_{glo}$ is generated by simply downsampling this augmented image. We then generate the local image patch $I_{loc}$ by randomly cropping a 512$\times$512 window from the augmented image that contains at least 1\% wire pixels. This helps balance wire and background pixels. We find that this simple constrained cropping approach yields better performance than other well-known balancing methods, including Focal loss~\cite{focal} and Online Hard Example Mining~\cite{ohem}.
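A minimal sketch of this constrained cropping step is shown below (illustrative Python only; the function and argument names are placeholders rather than our released implementation):
\begin{verbatim}
import numpy as np

def sample_wire_patch(image, mask, patch=512, min_wire_frac=0.01,
                      max_tries=50, rng=None):
    """Randomly crop a patch x patch window containing at least
    min_wire_frac wire pixels; keep the last attempt as a fallback."""
    if rng is None:
        rng = np.random.default_rng()
    h, w = mask.shape
    for _ in range(max_tries):
        top = rng.integers(0, h - patch + 1)
        left = rng.integers(0, w - patch + 1)
        m = mask[top:top + patch, left:left + patch]
        if m.mean() >= min_wire_frac:   # mask is binary {0, 1}
            break
    return image[top:top + patch, left:left + patch], m
\end{verbatim}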
|
| 47 |
+
|
| 48 |
+
We share the same feature extractor network between the coarse and fine modules, but train separate feature decoders. To account for the additional inputs to the fine module, we expand the input channels of the feature extractor from 3 to 5. The last two channels are set to 0 when passing an image through the coarse module, and set to the global logit map $Z_{glo}$ and the binary location map $M$ for the fine module. Both $Z_{glo}$ and $Z_{loc}$ are trained by computing Cross Entropy losses of the logit maps after softmax against their respective ground-truth wire annotations. The final loss $L$ of our model is the sum of the global cross-entropy loss $L_{glo}$ and the local cross-entropy loss $L_{loc}$. They are defined as follows:
|
| 49 |
+
|
| 50 |
+
\begin{equation}
|
| 51 |
+
\begin{aligned}
|
| 52 |
+
L_{glo} &= -\sum_{c\in C} G_{glo,c}\,\log \mathrm{Softmax}(Z_{glo})_c \\
|
| 53 |
+
L_{loc} &= -\sum_{c\in C} G_{loc,c}\,\log \mathrm{Softmax}(Z_{loc})_c \\
|
| 54 |
+
L &= L_{glo} + L_{loc}
|
| 55 |
+
\end{aligned}
|
| 56 |
+
\end{equation}
|
| 57 |
+
where $C$ is the set of categories ($C = \{\text{wire}, \text{background}\}$ in our task), and $G_{glo}$ and $G_{loc}$ denote the one-hot ground-truth wire annotations.\\
|
| 58 |
+
|
| 59 |
+
To perform inference with our model, for each high-resolution image, we first feed the downsampled image to the coarse module, which is the same as the training step. We then compute the global wire segmentation map by taking the $\mathrm{argmax}$ over the two classes for each pixel. Local refinement is done by running a sliding window over the entire image, where the fine step is performed if there are more than some percentage $\alpha$ of wire pixels within the image window. By conditioning on the global wire probability, we save computation time in regions where there are no wires, and maintain segmentation quality at high-resolution when potential wires are discovered. By passing the global logit to the fine module, we also allow the coarse module to provide information to the fine module for better inference quality.
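The globally conditioned sliding window can be sketched as follows (illustrative Python only; names are placeholders, and $\alpha$ is the threshold described above):
\begin{verbatim}
def fine_windows(coarse_wire_map, win, alpha):
    """Yield top-left corners of full-resolution windows whose coarse
    wire fraction exceeds alpha. coarse_wire_map is the argmax wire map
    (values in {0, 1}) upsampled to the original image size."""
    H, W = coarse_wire_map.shape
    for top in range(0, H, win):
        for left in range(0, W, win):
            tile = coarse_wire_map[top:top + win, left:left + win]
            if tile.mean() > alpha:   # fraction of coarse wire pixels
                yield top, left
\end{verbatim}
Only the selected windows are passed through the fine module at full resolution.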
|
| 60 |
+
|
| 61 |
+
\subsection{Implementation details}
|
| 62 |
+
|
| 63 |
+
We use MixTransformer~\cite{segformer} as our shared feature extractor. We expand the backbone's input RGB channel to five channels to accept the logit map and binary location map during the local step. We define two separate MLP feature decoders from~\cite{segformer} for the coarse and fine modules respectively.
|
| 64 |
+
|
| 65 |
+
We train our model on our training set with 2796 high-resolution images. The model is trained for 40k iterations with a batch size of 8. Global images are downsampled to 512$\times$512. We feed the global images to the coarse module and obtain a two-channel class logit map. A single 1$\times$1 convolution is used to transform this logit map into a single-channel map, which is then concatenated with the local RGB image and binary location mask. The five-channel input is finally fed into the fine module to obtain the local logit map. We use AdamW~\cite{adamw} with a learning rate of $0.00006$ and a weight decay of $0.01$. We use the ``poly'' LR scheduling rule with a factor of $1.0$.
|
| 66 |
+
|
| 67 |
+
In the testing stage, we set both the global image and local image patch size to 1024$\times$1024. Unless otherwise specified, we set the percentage for local refinement $\alpha$ to $0.01\%$.
|
| 68 |
+
|
| 69 |
+
We also train common semantic segmentation models on our dataset for comparison. For whole-image models, we train on downsampled whole images, as is done in our coarse module and most semantic segmentation methods. For sliding-window models, we train on full-resolution image patches and perform inference in a sliding-window manner. All experiments are trained on 4 Nvidia V100 GPUs and tested on a single V100 GPU.
|
| 70 |
+
|
| 71 |
+
% A straight-forward approach towards wire segmentation is to train a common object segmentation model on our high-resolution wire images. To avoid memory issues with large images, we adapt conventional semantic segmentation training pipelines by randomly downsampling the images to between 1024 and 2048 pixels on the longer side, while maintaining aspect ratio. Random rotation is added to increase variety of wire shapes. Random horizontal flip, photometric distortion and channel normalization are then added. To obtain the final segmentation prediction, the predicted probability map is bilinearly upsampled to the original image size, where evaluation takes place. We name this method coarse semantic segmentation, since the image downsampling and prediction upsampling steps cause the final output to have lower resolution than the original image.
|
| 72 |
+
|
| 73 |
+
% An intuitive alternative to coarse segmentation is to train the model on full-resolution image patches, then perform inference in a sliding-window manner. This way, the model captures the most details and thus gives the tightest prediction. The trade-off with the sliding window method is the inference time. Since each full-resolution image has to be cropped into multiple patches and be predicted individually, multiple iterations of inference have to be performed, resulting in significantly prolonged inference time when images are several times larger. As a comparison, we train and evaluate the same model architecture as coarse semantic segmentation, but instead of downsampling images to below 2048 pixels per side, we simply crop the full-resolution images without downsampling. During inference, a full-resolution large image is cropped into multiple image patches of the same size, and each patch is predicted individually. The finaly prediction map is constructed by stitching the prediction patches together. We name this method fine semantic segmentation, since the prediction is not upsampled and thus gives the tightest segmentation map.
|
| 74 |
+
|
| 75 |
+
% We show in Section~\ref{sec:results} that this method gives significantly better results than coarse semantic segmentation, but requires much longer to run.
|
| 76 |
+
|
| 77 |
+
% \subsection{The coarse-to-fine pipeline}
|
| 78 |
+
% Ideally, to perform efficient yet accurate semantic segmentation on large-resolution images, a fusion of the above two methods is most desired. Intuitively, we can use the coarse segmentation map as a condition to determine the regions in the image that require fine segmentation. To achieve this, we can divide the both the image and the coarse segmentation map into patches, and perform fine segmentation based on the result of the coarse segmentation patch, such as the percentage of wire pixels in the patch.
|
| 79 |
+
|
| 80 |
+
% Intuitively, we can simply use the trained models from the above two methods as the "combined model" for coarse-to-fine inference. Yet, this design requires two separate models, and do not share information between them. An improvement over this approach is to train a single model with both downsampled images and full-resolution image patches, then run coarse-to-fine inference using the same model, but this still does not provide a way to share information between both models.
|
| 81 |
+
|
| 82 |
+
% To ensure efficient inference without duplicated model structures, as well as enable information passing from the coarse inference stage to the fine inference stage, propose a sequential global-to-local network that shares the feature extractor backbone but contains two separate decoder heads. The concept can be thought of as the opposite of~\cite{glnet}, where they have separate feature extractors for global and local images. As a result, our model can be trained in an end-to-end manner, while their model requires a three-stage training scheme. Our proposed method is similar to~\cite{cascadepsp} in terms of the sequential inference pipeline, but differs in two major aspects. First, ~\cite{cascadepsp} shares the entire network for global and local inference. Second, their work is designed for segmentation refinement, meaning that their model requires a separate segmentation prediction as input, resulting in two iterations for global inference. On the other hand, our model only requires a single step in global inference.
|
| 83 |
+
|
| 84 |
+
% \subsection{Model description}
|
| 85 |
+
|
| 86 |
+
% Here we describe our two-stage model design and training details. Our model consists of a single shared feature extractor and two separate decoders. In our experiments, we use Mix Transformer (MiT)~\cite{segformer} as the feature extractor, and the multi-layer perceptron (MLP) decoder proposed by the same authors as the decoder structure. As mentioned above, we duplicate the decoder such that one is used for global inference and the other is used for local inference.
|
| 87 |
+
|
| 88 |
+
% To enable information sharing between the global step and the local step, we extend the model input from three channels to five. The first three channels are the original RGB channels, and the last two channels are for the local step, and is therefore deactivated (i.e. all zeros) during the global step. In the local step, the fourth channels is the probability map of the entire downsampled image. The fifth channel is a binary map, where the location of the local image patch projected back onto the whole image is set as one, and the remaining regions as zero. Compared to~\cite{cascadepsp}, where only the probability map of the local image patch is appended to the input, using the probability map of the entire image enables the network to gain contextual information of the image patch. At the same time, the binary map allows the network to condition on the local probability, thus performing accurate segmentation refinement. An illustration of our model design can be seen in Figure.~\ref{fig:model}.
|
| 89 |
+
|
| 90 |
+
% \textbf{stress on the nice part of the model: shared weights, size, etc...}
|
| 91 |
+
|
| 92 |
+
|
| 93 |
+
% \section{Toward Collecting High-resolution Wire Dataset}
|
| 94 |
+
|
| 95 |
+
% \subsection{High-resolution Wire Dataset}
|
| 96 |
+
% Albeit having major incentives towards distractor removal, few works exist that tackle wire segmentation in high-resolution outdoor images. To this end, we present a dataset consisting of xxx very high-resolution outdoor images with wires and cables. Image sizes in our dataset varies from X$\times$Y to W$\times$Z. Wire images in our dataset not only include power lines and cables as in~\cite{ttpla}, but also electrical wires attached to buildings and other structures. This adds significant variety to the types of wires in terms of color and lighting. In addition, while ~\cite{ttpla} contains power line images mostly from close distances, wires in our dataset images can appear from very far distances, which makes them extremely difficult to recognize even with careful visual inspection, to extreme close-ups where textures are clearly visible. This extreme pattern variety makes it very difficult for the model to infer from a single visual cue, such as color gradients.
|
| 97 |
+
|
| 98 |
+
% Table.\ref{table:dataset} shows statistics of our dataset. As can be seen, our dataset contains images that are significantly larger than other similar wire/cable datasets. (add stuff when stats and comparisons are done)
|
paper-tex/sections/method_yq.tex
ADDED
|
@@ -0,0 +1,111 @@
|
| 1 |
+
%auto-ignore
|
| 2 |
+
|
| 3 |
+
% \section{Method} \label{sec:method}
|
| 4 |
+
|
| 5 |
+
% In this section, we describe our two-stage model architecture. Figure~\ref{fig:pipeline} shows the overall model structure. We will first discuss motivations of our two-stage design, then describe our model architecture and training details.
|
| 6 |
+
|
| 7 |
+
%In this section, we first discuss motivations of our two-stage model architecture, then describe our model architecture and training details. Figure~\ref{fig:pipeline} shows the overall model structure.
|
| 8 |
+
|
| 9 |
+
\section{High-Resolution Wire Segmentation}
|
| 10 |
+
\label{sec:segmentation}
|
| 11 |
+
Wires appear visually different from common objects -- being thin, long, sparse and oftentimes partially occluded.
|
| 12 |
+
We find the following two design choices crucial to building an effective wire segmentation system: 1) a two-stage framework in which coarse predictions from global context guide precise segmentation of local patches, and 2) maximally preserving and augmenting the image features and annotations of wires throughout the pipeline.
|
| 13 |
+
% We thus propose a two-stage coarse-to-fine pipeline, in which we inject feature augmentation and architectural choices to maximally preserve the wire features throughout.
|
| 14 |
+
|
| 15 |
+
\subsection{The Two-stage Coarse-to-Fine Model}
|
| 16 |
+
%\yqe{
|
| 17 |
+
Figure \ref{fig:pipeline} shows the two-stage segmentation pipeline. It consists of a coarse and a fine module, which share an encoder $E$ and have their own decoders $D_C$ and $D_F$. Intuitively, the coarse module aims to capture global contextual information from the entire image and highlight the image regions possibly containing wires. Conditioned on the predictions from the coarse module, the fine module achieves high-resolution wire segmentation by only looking at local patches likely containing wires.
|
| 18 |
+
|
| 19 |
+
Given a high-resolution image $I_\textrm{glo}$, we first bilinearly downsample it to $I_\textrm{glo}^{ds}$ with a fixed size $p\times p$ and feed it into the coarse module. The module predicts the global probability map $P_\textrm{glo} = \textrm{SoftMax}(D_C(E(I_\textrm{glo}^{ds})))$ containing the activation of the wire regions.% with rich global contextual information.
|
| 20 |
+
|
| 21 |
+
For each patch $I_\textrm{loc}$ of size $p \times p$ cropped from the full-resolution image $I_\textrm{glo}$, and the corresponding conditional probability map $P_\textrm{con}$ cropped from $P_\textrm{glo}$, we predict the local probability $P_\textrm{loc} = \textrm{SoftMax}(D_F(E(I_\textrm{loc}, P_\textrm{con})))$.
|
| 22 |
+
%$M$ is a binary location mask indicating the patch region of $I_{loc}$ in $I_{glo}$ with 1, and elsewhere 0.
|
| 23 |
+
Note that $E$ is shared between the coarse and the fine modules, so it must take inputs with the same number of channels. Therefore, for the coarse module, we concatenate an additional zero channel to the input image to keep the channel count consistent.
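The two passes can be sketched as follows (illustrative PyTorch-style Python; the module and tensor names are placeholders, the condition is taken here to be the wire-probability channel of $P_\textrm{glo}$, and the extra feature channels introduced in Section~\ref{sec:wire_feature_preserve} are omitted):
\begin{verbatim}
import torch
import torch.nn.functional as F

def coarse_forward(encoder, dec_coarse, img_glo, p=512):
    """Downsample the full image and predict the global probability map."""
    img_ds = F.interpolate(img_glo, size=(p, p), mode='bilinear',
                           align_corners=False)
    zero = torch.zeros_like(img_ds[:, :1])   # placeholder condition channel
    x = torch.cat([img_ds, zero], dim=1)
    return torch.softmax(dec_coarse(encoder(x)), dim=1)

def fine_forward(encoder, dec_fine, img_loc, p_con):
    """Predict the local probability map for one full-resolution patch,
    conditioned on the cropped global wire probability p_con."""
    x = torch.cat([img_loc, p_con], dim=1)
    return torch.softmax(dec_fine(encoder(x)), dim=1)
\end{verbatim}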
|
| 24 |
+
% We formulate our problem as a semantic segmentation task, so we can regard it as a pixel-wise classification problem. Each image pixel is labeled as 0 or 1 indicating background or wire pixel.
|
| 25 |
+
|
| 26 |
+
% We apply Cross Entropy (CE) loss to both the global $P_\textrm{glo} = \textrm{SoftMax}(Z_\textrm{glo})$ and local probability map $P_\textrm{loc} = \textrm{SoftMax}(Z_\textrm{loc})$, comparing with their ground truth annotations $G_\textrm{glo}$ and $G_\textrm{loc}$.
|
| 27 |
+
|
| 28 |
+
We apply the Cross Entropy (CE) loss to both the global probability map $P_\textrm{glo}$ and the local probability map $P_\textrm{loc}$, comparing them with their ground-truth annotations $G_\textrm{glo}$ and $G_\textrm{loc}$.
|
| 29 |
+
|
| 30 |
+
\vspace{-2mm}
|
| 31 |
+
\begin{equation}
|
| 32 |
+
\begin{aligned}
|
| 33 |
+
% \mathcal{L}_{glo} &= -\sum_{c\in C}\log\ Softmax(Z_{glo})_c \\
|
| 34 |
+
%\mathcal{L}_{loc} &= -\sum_{c\in C}\log\ Softmax(Z_{loc})_c \\
|
| 35 |
+
\mathcal{L}_\textrm{glo} &= \textrm{CE}(P_\textrm{glo}, G_\textrm{glo}) \\
|
| 36 |
+
\mathcal{L}_\textrm{loc} &= \textrm{CE}(P_\textrm{loc}, G_\textrm{loc}) \\
|
| 37 |
+
\end{aligned}
|
| 38 |
+
\end{equation}
|
| 39 |
+
%Both $Z_{glo}$ and $Z_{loc}$ are trained by computing Cross Entropy losses of the logit maps after softmax against their respective ground truth wire annotations.
|
| 40 |
+
The final loss $\mathcal{L}$ is the weighted sum of the two:
|
| 41 |
+
|
| 42 |
+
\vspace{-3mm}
|
| 43 |
+
\begin{equation}
|
| 44 |
+
\begin{aligned}
|
| 45 |
+
\mathcal{L} &= \mathcal{L}_ \textrm{glo} + \lambda \mathcal{L}_ \textrm{loc},
|
| 46 |
+
\end{aligned}
|
| 47 |
+
\end{equation}
|
| 48 |
+
|
| 49 |
+
%Given a high-resolution image $I$, the coarse module of our two-stage model aims to recognize semantics of the entire image and pick up wire regions at a coarse level. Looking at the entire image allows the coarse module to capture more contextual information. To fit the large images into a GPU for training and inference, images are first bilinearly downsampled, then fed into the coarse module to predict a global logit map $Z_{glo}$. We use $Z_{glo}$ as a conditional input to guide the fine module in the next stage.
|
| 50 |
+
|
| 51 |
+
%The fine module is conditioned on the output from the coarse module. This module takes in a local image patch $I_{loc}$ of the whole image at full resolution, the global logit map $Z_{glo}$ from the coarse module, and a binary location mask $M$ that sets the patch relative to the full image to $1$ and other regions to $0$. The fine module then predicts the local wire logit map $Z_{loc}$. Empirically, we find that concatenating the entire global logit map rather than cropping the logit map at the location of the image patch yields slightly improved results.
|
| 52 |
+
|
| 53 |
+
%The designs of the coarse and fine modules are conceptually the same as those in GLNet~\cite{glnet} and MagNet~\cite{magnet}, where a global network is trained on entire downsampled images and a local network is trained on higher-resolution image patches conditioned on some global features. However, unlike GLNet, where intermediate features are shared between the global and local branch bidirectionally, we opt for a simpler late fusion by concatenating the logit map directly to the fine module. We also only use two stages instead of up to four stage as done in MagNet, since a single fine module is already sufficient at refining annotations that are only several pixels thick, and additional stages can drastically increase inference time.
|
| 54 |
+
|
| 55 |
+
where we
|
| 56 |
+
%empirically
|
| 57 |
+
set $\lambda = 1$ for training.
|
| 58 |
+
% During training,
|
| 59 |
+
% we apply data augmentation including random scaling, horizontal flipping and photometric distortion to the full-resolution image to obtain the input $I_{glo}$. After that,
|
| 60 |
+
% we randomly crop one local patch $I_{loc}$ with patch size $p=512$ from the augmented image.
|
| 61 |
+
Similar to Focal loss~\cite{focal} and Online Hard Example Mining~\cite{ohem}, we balance the wire and background samples in the training set by selecting patches that contain at least 1\% of wire pixels. %We find that this simple constrained cropping approach yields better performances than other well known balancing methods, including Focal loss~\cite{focal} and Online Hard Example Mining~\cite{ohem}.
|
| 62 |
+
|
| 63 |
+
%The global image $I_{glo}$ is generated by simply downsampling this augmented image. We then generate the local image patch $I_{loc}$ by randomly cropping a 512$\times$512 window from the augmented image that contains at least 1\% wire pixels. This helps balance between wire and background pixels. We find that this simple constrained cropping approach yields better performances than other well known balancing methods, including Focal loss~\cite{focal} and Online Hard Example Mining~\cite{ohem}.
|
| 64 |
+
|
| 65 |
+
%We share the same feature extractor network between both the coarse and fine module, but train separate feature decoders.
|
| 66 |
+
%To account for the additional inputs to the fine module, we expand the input channels of the feature extractor from 3 to 5. The last two channels are set to 0 when passing an image through the coarse module, and set to the global logit map $Z_{glo}$ and the binary location map $M$ for the fine module.
|
| 67 |
+
To perform inference, we first feed the downsampled image to the coarse module, the same as in training.
|
| 68 |
+
% We then compute the global wire segmentation map by taking the $\mathrm{argmax}$ over the two classes for each pixel.
|
| 69 |
+
Local inference is done by running a sliding window over the entire image, where a patch is processed only when it contains at least a certain percentage of predicted wire pixels (determined by the threshold~$\alpha$). This brings two advantages: first, we save computation time in regions where there are no wires; second, the local fine module can leverage information from the global branch for better inference quality.
|
| 70 |
+
|
| 71 |
+
% The design of our two-stage pipeline is conceptually similar to GLNet~\cite{glnet} and MagNet~\cite{magnet}, where a global network is trained on downsampled images and a local network is trained on image patches at the original resolution. However, unlike GLNet whose intermediate features are shared between the global and local branch bidirectionally, we opt for a simpler late fusion by concatenating the probability map directly to the fine module. We only use two stages instead of four stages in MagNet. Our pipeline demonstrates its efficiency and simplicity while coping with wire-like objects which are only several pixels thick, and avoids high computational demands from some previous works.
|
| 72 |
+
|
| 73 |
+
\subsection{Wire Feature Preservation}
|
| 74 |
+
\label{sec:wire_feature_preserve}
|
| 75 |
+
As wires are thin and sparse, downsampling the input images may make the wire features vanish entirely. To mitigate this, we propose a simple feature augmentation technique that takes the minimum and maximum pixel luminance of the input image over a local window. Either the local minimum or maximum makes wire pixels more visually apparent. In practice, we concatenate the min- and max-filtered luminance channels to the RGB image and condition map, resulting in six input channels in total. We name this component MinMax.
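A minimal sketch of the MinMax filtering (illustrative Python only; the window size and luminance weights are placeholders):
\begin{verbatim}
import numpy as np
from scipy.ndimage import maximum_filter, minimum_filter

def minmax_channels(rgb, window=6):
    """Return min- and max-filtered luminance channels for an RGB image
    rgb of shape (H, W, 3) with values in [0, 1]."""
    lum = 0.299 * rgb[..., 0] + 0.587 * rgb[..., 1] + 0.114 * rgb[..., 2]
    lum_min = minimum_filter(lum, size=window)  # dark, shadowed wires pop out
    lum_max = maximum_filter(lum, size=window)  # bright, reflective wires pop out
    return lum_min, lum_max
\end{verbatim}
The filtered channels are concatenated to the RGB image and condition map, so that thin wires remain visible even after downsampling.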
|
| 76 |
+
%to form a 5-channel input.
|
| 77 |
+
% Unlike common object segmentation, with prominent textures and colors, wires, especially outdoor cables, tend to either be shadowed or reflective under sunlight. As a result, it is common to see either very bright or very dark wires in images. An intuitive approach is to find the min and max of an image before downsampling, and preserve this feature as input to the model.
|
| 78 |
+
% \todo{show an example patch of luminance min max filter:}
|
| 79 |
+
|
| 80 |
+
%\subsection{Motivation}
|
| 81 |
+
% We design a two-stage coarse-to-fine pipeline for wire-like object segmentation. This model structure
|
| 82 |
+
% we divide our model into two major components -- a coarse module and a fine module. Our two-stage model design
|
| 83 |
+
% is motivated by two observations. First, wires in our dataset are extremely long and thin, many spanning over several thousand pixels but only occupy several pixels across. Limited by the memory size of modern GPUs, we cannot simply pass the entire image at full-resolution to a model for inference. As a result, two separate modules are required for high-resolution inference, where the coarse module captures the entire wire and its surrounding contextual information at a lower resolution, and the fine module captures detailed textures of the wire at the original resolution.
|
| 84 |
+
|
| 85 |
+
% Second, we observe that wire pixels are sparse, where a typical high-resolution image contains a small percentages of wire pixels. This means that we can also use the coarse module to guide the fine module on what regions to capture the details. The fine module can save computation by only predicting segmentation masks where there are wires.
|
| 86 |
+
|
| 87 |
+
% \subsection{Global Condition}
|
| 88 |
+
% Many high-resolution segmentation methods share the idea of using global-local refinement module. But each method differs in details that are tailored to their targeted applications.
|
| 89 |
+
% MagNet~\cite{magnet} trains a single model by randomly sampling global/local patches, and then trains a refinement module that takes as input the two predictions. This neglects the causal relationship between global and local features. The refinement module does not take into account any image feature. The recently proposed ISDNet~\cite{isdnet} use a shallow network and takes the entire image as input. This is proven effective in some high-resolution segmentation datasets such as DeepGlobe~\cite{deepglobe}, where resolution is fixed and manageable (5k$\times$5k). But the model is unable to scale to larger images. The shallow model, while being efficient, is limited in model capacity, especially with sparse labels.
|
| 90 |
+
% the fact that they take the entire image as input strictly limits their model to use lightweight networks as the shallow branch (i.e. STDC~\cite{stdc} with output stride=8), which has limited performance in cases where the target label is also small.
|
| 91 |
+
% In fact, we were unable to train ISDNet with a stronger shallow network to produce high-resolution outputs.
|
| 92 |
+
% ~\cite{learning_downsample} attempts to learn a downsampling network, which we tried but found was detrimental for our thin and sparse wire masks.
|
| 93 |
+
% As a result,
|
| 94 |
+
|
| 95 |
+
Besides feature augmentations, we also adapt the architecture to maximally preserve the sparse wire annotations.
|
| 96 |
+
% Different from existing methods, we find it crucial to keep the image at a close-to-original resolution to maximally preserve the image features of wires. In addition, we find it similarly important to preserve the sparse wire annotations.
|
| 97 |
+
% We use the global branch as a conditioning network for training both coarse and high resolution segmentation tasks rather than a simple coarse branch that is only learned for the coarse resolution.
|
| 98 |
+
We propose to use ``overprediction'' and achieve this by applying max-pool downsampling to the coarse labels during training, which preserves wire activations throughout the coarse branch. We name this component MaxPool. We provide ablation studies for these components in Section~\ref{sec:results}.
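A minimal sketch of this max-pool label downsampling (illustrative PyTorch-style Python; the names and the integer-factor assumption are ours):
\begin{verbatim}
import torch
import torch.nn.functional as F

def downsample_labels_maxpool(labels, factor):
    """Downsample a binary wire label map (B, H, W) by max pooling, so any
    wire pixel inside a factor x factor cell keeps that cell labeled as
    wire ("overprediction") instead of vanishing under nearest resizing."""
    x = labels.float().unsqueeze(1)                  # (B, 1, H, W)
    x = F.max_pool2d(x, kernel_size=factor, stride=factor)
    return x.squeeze(1).long()                       # (B, H/factor, W/factor)
\end{verbatim}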
|
| 99 |
+
|
| 100 |
+
\vspace{-5mm}
|
| 101 |
+
\section{High-Resolution Wire Inpainting}
|
| 102 |
+
\label{sec:inpainting}
|
| 103 |
+
Given a full-resolution wire segmentation mask estimated by our wire segmentation model, we propose an inpainting pipeline to remove and fill in the wire regions. Our approach addresses two major challenges in wire inpainting. First, recent state-of-the-art deep inpainting methods do not handle arbitrary-resolution images, which is critical for high-resolution wire removal. Second, deep inpainting methods often suffer from color inconsistency when the background has uniform (or slowly varying) colors. This issue is particularly significant for wires, as they are often in front of uniform backgrounds, such as the sky or building facades. Commonly used reconstruction losses, such as L1, are not sensitive to color inconsistency, which further exacerbates this issue.
|
| 104 |
+
|
| 105 |
+
We thus revisit the efficient deep inpainting method LaMa \cite{suvorov2022resolution}. Compared with other inpainting models, LaMa has two major advantages. First, it contains Fourier convolutional layers, which enable efficient and high-quality structural completion. This helps complete building facades and other man-made structures with fewer artifacts. Second, its high inference efficiency makes a tile-based inference approach possible for high-resolution images.
|
| 106 |
+
|
| 107 |
+
% To tailor LaMa for wire removal, we fix the input size of the model to $512\times512$ and train the model on an augmented dataset by including synthetic wire masks and cropped patches from full-resolution images.
|
| 108 |
+
To address color inconsistency, we propose a novel ``onion-peel'' color adjustment module. Specifically, we compute the mean of the RGB channels within the onion-peel region $M_o = D(M, d) - M$ of the wire mask $M$, where $D$ is the binary dilation operator and $d$ is the kernel size. The color difference for each channel $c \in \{R, G, B\}$ becomes $\textrm{Bias}_c = \mathbb{E}[M_o (x_c - y_c)]$, where $x$ is the input image and $y$ is the output from the inpainting network. The final output of the inpainting model is $\hat{y}_c = y_c + \textrm{Bias}_{c}$. Loss functions are then applied to $\hat{y}_c$ to achieve color consistency while compositing the final result $y_{out} = (1 - M) \odot x + M \odot \hat{y}$.
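A minimal sketch of the onion-peel adjustment (illustrative Python only; the dilation width and array conventions are placeholders):
\begin{verbatim}
import numpy as np
from scipy.ndimage import binary_dilation

def onion_peel_adjust(x, y, mask, d=15):
    """Shift the inpainted output y toward the input x by the mean color
    difference inside the onion-peel ring around the wire mask.
    x, y: (H, W, 3) float images; mask: (H, W) binary wire mask."""
    wire = mask.astype(bool)
    ring = binary_dilation(wire, iterations=d) & ~wire   # M_o = D(M, d) - M
    bias = (x[ring] - y[ring]).mean(axis=0)              # per-channel Bias_c
    y_hat = y + bias                                      # color-corrected output
    m = wire.astype(float)[..., None]
    return (1.0 - m) * x + m * y_hat                      # composite y_out
\end{verbatim}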
|
| 109 |
+
|
| 110 |
+
% While running inference on full-resolution images, we apply a tile-based approach, by fixing the window size at $512\times 512$ with an $32$-pixel overlap.
|
| 111 |
+
% This makes the model consistent in training and testing settings, and gives good textural and structural details for local regions.
|
paper-tex/sections/related_work.tex
ADDED
|
@@ -0,0 +1,36 @@
|
| 1 |
+
%auto-ignore
|
| 2 |
+
\section{Related Work} \label{sec:related}
|
| 3 |
+
|
| 4 |
+
\input{figure_tex/motivation}
|
| 5 |
+
|
| 6 |
+
% \textcolor{blue}{1. semantic segemtnation,however, as we will discuss, directly using these methods will nto help. , 3. high resolution: xxx, - directly using them is not working 3. wire segmentnion.}
|
| 7 |
+
|
| 8 |
+
%We introduce related works in the following way. We first describe several widely-used semantic segmentation methods, followed by methods that tackle more specific challenges such as high-resolution segmentation and wire segmentation. We describe key principles of these works, where we can draw inspirations and improve upon.
|
| 9 |
+
|
| 10 |
+
\paragraph{Semantic segmentation}
|
| 11 |
+
Semantic segmentation has been actively researched over the past decade. For example, the DeepLab series~\cite{deeplab, deeplabv3, deeplabv3p} has been one of the most widely used families of semantic segmentation methods. These models leverage dilated convolutions to capture long-range pixel correlations. Similarly, CCNet~\cite{ccnet} attends to non-local regions via a two-step axial attention mechanism, and PSPNet~\cite{pspnet} uses multi-scale pooling to extract high-resolution features.
|
| 12 |
+
|
| 13 |
+
Recently, the self-attention mechanism~\cite{attention} has gained increasing popularity.
|
| 14 |
+
% Originally applied in Natural Language Processing tasks, its vision-based variants~\cite{vit, swin} have shown to be superior to traditional convolution-based methods.
|
| 15 |
+
Transformer-based models for semantic segmentation~\cite{dpt, setr, swin, segformer, hassani2022neighborhood, hassani2022dilated, jain2021semask, jain2022oneformer} significantly outperform convolution-based networks since the attention modules benefit from their global receptive fields~\cite{segformer}, which let the models attend to objects that span across larger portions of the feature map.
|
| 16 |
+
|
| 17 |
+
While the above methods work well for common object semantic segmentation, when applied to our task of wire segmentation in high-resolution images, they either drop significantly in segmentation quality or require long inference times. We show in Section~\ref{sec:results} that directly applying these methods to our task yields undesirable results.
|
| 18 |
+
|
| 19 |
+
\vspace{-5mm}
|
| 20 |
+
\paragraph{High-resolution image segmentation}
|
| 21 |
+
Segmentation in high-resolution images involves additional design considerations. It is computationally infeasible to perform inference on the full-resolution image with a deep network. As a result, to maximally preserve image details within the available computation resources, many methods employ a global-local inference pipeline. For instance, GLNet~\cite{glnet} simultaneously predicts a coarse segmentation map on the downsampled image and a fine segmentation map on local patches at the original resolution, then fuses them to produce the final prediction.
|
| 22 |
+
%Their model shares features produced by both the global and local branch, thus achieving feature fusion.
|
| 23 |
+
MagNet~\cite{magnet} is a recent method that proposes to iteratively predict and refine coarse segmentation maps at multiple scales using a single feature extractor and multiple lightweight refinement modules. CascadePSP~\cite{cascadepsp} trains a standalone class-agnostic model to refine predictions at a higher resolution from a pretrained segmentation model. ISDNet~\cite{isdnet} proposes to use an extremely lightweight subnetwork that takes in the entire full-resolution image. However, the subnetwork is limited in capacity and thus in segmentation quality. We share with these past works the idea of using a coarse-to-fine approach for wire segmentation, but modify the architecture and data processing to tailor them to wires.
|
| 24 |
+
|
| 25 |
+
% Most of these methods share a similar coarse-to-fine approach, where instead of performing a single inference on high-resolution images, they divide them into a coarse-to-fine approach. We repurpose important components of these methods into a two-stage model design optimized for wire segmentation in high-resolution images.
|
| 26 |
+
%While our model pipeline is similar to the above methods, there are a few fundamental differences that makes our pipeline more effective and efficient in our task. First, GLNet essentially uses two separate networks for their global and local branches. The combined network does not share weights, and requires a three-stage training scheme. Meanwhile, CascadePSP only does segmentation refinement, which means they require an entire separate network to produce the initial prediction. MagNet uses an iterative refinement method at multiple scales, where the inference time scales quadratically with number of refinement scales. They also use a much smaller module for refinement, which limits quality of refinement. In contrast, our proposed network shares the feature extractor and can be trained end-to-end. Our model is also capable of predicting accurate wire masks on very high-resolution images with only two stages and without requiring any separately trained model for initial prediction.
|
| 27 |
+
\vspace{-5mm}
|
| 28 |
+
\paragraph{Wire/Curve segmentation}
|
| 29 |
+
While few works tackle wire segmentation in high-resolution images, there are prior works that handle similar objects. For example, Transmission Line Detection (TLD) is an actively researched area in aerial imagery for drone applications. Convolutional neural networks are used~\cite{ttpla, pldu, cable_inst, lsnet} to segment overhanging power cables in outdoor scenes. However, wire patterns in TLD datasets are relatively consistent in appearance and shape -- evenly spaced and only spanning locally. In contrast, we handle more generic wires seen in regular photographic contents, where the wire appearance has much higher variety.
|
| 30 |
+
%We will discuss in Section~\ref{sec:results} that the models from these works do not generalize to wires in our task.
|
| 31 |
+
|
| 32 |
+
Some other topics are loosely related to our task. Lane detection~\cite{lanedet,swiftlane,structurelane} aims to segment lanes for autonomous driving applications. These methods benefit from simple line parameterization (e.g., as two end-points), and strong positional priors. In contrast, as shown in Figure~\ref{fig:motivation}, wires vary drastically in shapes and sizes in our task, thus making them difficult to parameterize.
|
| 33 |
+
|
| 34 |
+
\vspace{-5mm}
|
| 35 |
+
\paragraph{High-Resolution Image Inpainting}
|
| 36 |
+
Image inpainting has been well explored using patch synthesis-based methods \cite{barnes2009patchmatch, wexler2007space, darabi2012image, kaspar2015self} or deep neural networks \cite{contextencoder, globallocal, partialconv, contextual, yu2019free, xu2022image}. Zhao \textit{et al.} leveraged the powerful image synthesis ability of StyleGAN2 \cite{karras2020analyzing} and proposed CoModGAN \cite{comodgan} to push image generation quality to a new level, followed by \cite{zheng2022cm, jain2022keys}. Most of these deep models cannot be applied to inpainting of high-resolution images. The latest diffusion-based inpainting models, such as DALL-E 2 \cite{dalle}, LDM \cite{rombach2022high} and Stable Diffusion, also suffer from long inference times and low output resolution. ProFill \cite{zeng2020high} was first proposed to address high-resolution inpainting via a guided super-resolution module. HiFill \cite{hifill} utilized a contextual residual aggregation module and supports resolutions up to 8K. LaMa \cite{suvorov2022resolution} applied Fourier convolutional residual blocks to propagate image structures well. LaMa was trained on only $256 \times 256$ images, but can be applied to images up to 2K with high quality. Recently, Zhang \textit{et al.} \cite{supercaf} proposed to use guided PatchMatch for any-resolution inpainting, extending the deep inpainting results from LaMa to modern camera resolutions. Textures are better reused, while structure and line completion at high resolution can still be challenging. In this paper, we aim at removing wires from high-resolution photos. The problem becomes easier if we run inpainting in a local manner, since wires are usually thin and long. Therefore, we propose to revisit LaMa for wire removal and run the inference in a tile-based fashion.
|
paper-tex/sections/results.tex
ADDED
|
@@ -0,0 +1,156 @@
|
| 1 |
+
%auto-ignore
|
| 2 |
+
\section{Experiments} \label{sec:results}
|
| 3 |
+
|
| 4 |
+
\subsection{Implementation Details}
|
| 5 |
+
\paragraph{Wire Segmentation Network.}
|
| 6 |
+
We experiment with ResNet-50~\cite{resnet} and MixTransformer-B2~\cite{segformer} as our shared feature extractor. We expand the three input RGB channels to six channels by concatenating the conditional probability map and the min- and max-filtered luminance channels. For the min and max filtering, we use a fixed 6$\times$6 kernel. We use separate decoders for the coarse and fine modules, denoted as $D_C$ and $D_F$ respectively.
|
| 7 |
+
|
| 8 |
+
%\textcolor{purple}{
|
| 9 |
+
We use the MLP decoder proposed in~\cite{segformer} for the MixTransformer segmentation model, and the ASPP decoder in~\cite{deeplabv3p} for our ResNet-50 segmentation model. In both the segmentation and inpainting modules, we take the per-pixel average of the predicted probability when merging overlapping patches. To crop $P_\mathrm{con}$ from $P_\mathrm{glo}$, we upsample the predicted $P_\mathrm{glo}$ to the original resolution, then crop the predicted regions according to the sliding window position.
|
| 10 |
+
%}
|
| 11 |
+
|
| 12 |
+
%\textcolor{purple}{
|
| 13 |
+
To train the segmentation module, we downsample the image $I_\mathrm{glo}$ to $p\times p$ to obtain $I_\mathrm{glo}^\mathrm{ds}$. From $I_\mathrm{glo}$, we randomly crop one $p\times p$ patch $I_\mathrm{loc}$ that contains at least 1\% wire pixels. This gives a pair of $I_\mathrm{glo}^\mathrm{ds}$ and $I_\mathrm{loc}$ to compute the losses. During inference, $I_\mathrm{glo}^\mathrm{ds}$ is obtained in the same way as training, while multiple $I_\mathrm{loc}$ are obtained via a sliding window sampled only when the proportion of wire pixels is above $\alpha$. All feature extractors are pretrained on ImageNet.
|
| 14 |
+
%, while the decoders are trained from scratch.
|
| 15 |
+
%}
|
| 16 |
+
|
| 17 |
+
We train our model on 5000 training images. The model is trained for 80k iterations with a batch size of 4. We set patch size $p = 512$ during training.
|
| 18 |
+
%Global images are downsampled to 512$\times$512. We feed the global images to the coarse module and obtain a two channel class logit map. A single 1$\times$1 convolution is used to transform this logit map into a single channel map, which is then concatenated with the local RGB image and binary location mask. The five-channel input is finally fed into the fine module to obtain the local logit map.
|
| 19 |
+
For all ResNet models, we use SGD with a learning rate of 0.01, a momentum of 0.9 and weight decay of 0.0005. For MixTransformer models, we use AdamW~\cite{adamw} with a learning rate of 0.0002 and weight decay of 0.0001. Our training follows the ``poly'' learning rate schedule with a power of 0.9. During inference, we set both the global image size and local patch size $p$ to 1024. Unless otherwise specified, we set the percentage for local refinement to $1\%$ ($\alpha=0.01$).
|
| 20 |
+
|
| 21 |
+
%In this section, we provide a set of quantitative results and comparisons between our two-stage model and other methods. We then provide ablation studies, qualitative visualizations and discuss several failure cases. For additional results and details, please refer to our supplementary materials.
|
| 22 |
+
|
| 23 |
+
%\todo{illurstrate the sampling strategy in training and inference strategy here?}
|
| 24 |
+
\vspace{-4mm}
|
| 25 |
+
\paragraph{Wire Inpainting Network.}
|
| 26 |
+
We adopt LaMa \cite{suvorov2022resolution} for wire inpainting by finetuning on an augmented wire dataset.
|
| 27 |
+
To prepare the wire training set, we randomly crop ten $680\times680$ patches from the non-wire regions of each image in our training partition. In total, we have 50K more training images in addition to
|
| 28 |
+
%\textcolor{purple}{
|
| 29 |
+
the 8M
|
| 30 |
+
%}
|
| 31 |
+
Places2 \cite{zhou2017places} dataset, and increase its sampling rate %\textcolor{purple}{
|
| 32 |
+
by $10\times$
|
| 33 |
+
%}
|
| 34 |
+
to balance the dataset. We also use all the ground truth segmentation maps in our training set to sample wire-like masks. During training, we start from Big-LaMa weights, and train the model on $512\times 512$ patches. We also prepare a synthetic wire inpainting quality evaluation dataset, containing 1000 images at $512\times 512$ with synthetic wire masks.
|
| 35 |
+
%containing 1000 images of $512 \times 512$ and synthetic wire masks on that.
|
| 36 |
+
While running inference on full-resolution images, we apply a tile-based approach, fixing the window size at $512\times 512$ with a $32$-pixel overlap.
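A minimal sketch of the tile-based inference (illustrative Python only; names are placeholders, and overlapping outputs are merged by per-pixel averaging as described above):
\begin{verbatim}
import numpy as np

def tile_positions(length, tile, step):
    """Window starts covering [0, length), with the last window clamped."""
    pos = list(range(0, max(length - tile, 0) + 1, step))
    if pos[-1] + tile < length:
        pos.append(length - tile)
    return pos

def tiled_inference(image, infer_fn, tile=512, overlap=32):
    """Run infer_fn on overlapping tiles of an (H, W, C) image and average
    the overlapping outputs; infer_fn must preserve the spatial size."""
    H, W = image.shape[:2]
    out = np.zeros(image.shape, dtype=np.float64)
    weight = np.zeros((H, W, 1), dtype=np.float64)
    step = tile - overlap
    for top in tile_positions(H, tile, step):
        for left in tile_positions(W, tile, step):
            patch = image[top:top + tile, left:left + tile]
            out[top:top + tile, left:left + tile] += infer_fn(patch)
            weight[top:top + tile, left:left + tile] += 1.0
    return out / np.maximum(weight, 1.0)
\end{verbatim}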
|
| 37 |
+
|
| 38 |
+
\subsection{Wire Segmentation Evaluation}
|
| 39 |
+
\paragraph{Quantitative Evaluation}
|
| 40 |
+
|
| 41 |
+
%\subsection{Globally Conditioned Inference}
|
| 42 |
+
|
| 43 |
+
We compare with several widely-used object semantic segmentation and high-resolution semantic segmentation models. Specifically, we train DeepLabv3+~\cite{deeplabv3p} with ResNet-50~\cite{resnet} backbone under two settings: global and local. In the global setting, the original images are resized to 1024$\times$1024. In the local setting, we randomly crop 1024$\times$1024 patches from the original images. We train our models on 4 Nvidia V100 GPUs and test them on a single V100 GPU. For high-resolution semantic segmentation models, we compare with CascadePSP~\cite{cascadepsp}, MagNet~\cite{magnet} and ISDNet~\cite{isdnet}. We describe the training details of these works in the supplement.
|
| 44 |
+
|
| 45 |
+
We present the results in Table~\ref{table:results}, evaluated on \benchmark. We report wire IoU, F1-score, precision and recall for quantitative evaluation. We also report wire IoUs for images at three scales: small (0 -- 3000$\times$3000), medium (3000$\times$3000 -- 6000$\times$6000) and large (6000$\times$6000+), which are useful for analyzing model characteristics. Finally, we report the average, minimum and maximum inference times on \benchmark.
|
| 46 |
+
|
| 47 |
+
\input{tables/results}
|
| 48 |
+
|
| 49 |
+
As shown in Table~\ref{table:results}, the global model runs fast but yields lower wire IoUs. In contrast, the local model produces high-quality predictions but requires a very long inference time.
|
| 50 |
+
Meanwhile, although CascadePSP is a class-agnostic model designed for high-resolution segmentation refinement, it primarily targets common objects and does not generalize to wires.
|
| 51 |
+
% \textcolor{red}{\sout{We thus retrain CascadePSP on our dataset but find their data perturbation does not realistic model coarse wire segmentation, thus cannot effectively conduct refinement.}}
|
| 52 |
+
MagNet's refinement module takes in only probability maps, without image features, and thus fails to refine windows where the input prediction is inaccurate.
|
| 53 |
+
% As a result, the refinement module cannot accurately produce high-resolution wire predictions.
|
| 54 |
+
Among these works, ISDNet is relatively effective and efficient at wire prediction.
|
| 55 |
+
% In fact, their inference time is on par with the global network even at high image resolution, while maintaining relatively high wire IoU.
|
| 56 |
+
However, its shallow network design trades capacity for efficiency, limiting its performance on thin and sparse wires.
|
| 57 |
+
% \textcolor{red}{\sout{For a fair comparison, we tried to replace the shallow branch in ISDNet with a MixTransformer backbone but failed due to GPU memory limitation. We thus only replace their deep network module, which only yields minor improvement, as shown in Table~\ref{table:results}.}}
|
| 58 |
+
|
| 59 |
+
% \textcolor{purple}{
|
| 60 |
+
Compared to the methods above, our model achieves the best trade-off between accuracy and memory consumption. By leveraging the fact that wires are sparse and thin, our pipeline captures both global and local features more efficiently, saving substantial computation while maintaining high segmentation quality.
|
| 61 |
+
% }
|
| 62 |
+
% There are two main reasons for this poor performance.
|
| 63 |
+
% First, the global model performs inference on downsampled images, which leads to degraded image quality around thin wire-like objects and causes imprecise or disjointed predictions. Second, since prediction maps are upsampled to the original resolution, the final segmentation map may contain artifacts such as aliasing and over-predicted regions.
|
| 64 |
+
% \mt{
|
| 65 |
+
% First, since whole-image models predict segmentation maps on downsampled images, predictions on extremely small wires may be incomplete or of low quality. Second, whole-image segmentation maps are obtained by bilinearly upsampling the model output which leads to loose predictions.
|
| 66 |
+
% }
|
| 67 |
+
% These issues are rare in common object semantic segmentation since their maximum downsampling rate is usually no greater than 3$\times$, while the downsampling rate in our task can reach 10$\times$ (e.g. downsampling 10k$\times$10k images to 1024$\times$1024.). As a result, common object semantic segmentation methods fail to maintain their performances in our task.
|
| 68 |
+
|
| 69 |
+
\input{figure_tex/comparison}
|
| 70 |
+
|
| 71 |
+
% Meanwhile, local models yield higher IoUs, but have significantly longer inference times. This is because in order to preserve resolutions of very large images, many iterations of sliding-window have to be performed (e.g. 25 iterations are required for an image with size 5120$\times$5120 and a window size of 1024$\times$1024), which leads to significantly slower inference speeds.
|
| 72 |
+
|
| 73 |
+
% In contrast, our two-stage model achieves better results than sliding-window models while only requiring less than half the inference time. The coarse module in our two-stage model determines potential wire regions for refinement, and skips the refinement step on regions with no wires. This saves inference time and increases the overall inference speed. In regions where there are wires, the fine module leverages information from the coarse module to more accurately predict a tight segmentation mask. These two factors together yield an effective and efficient model for wire semantic segmentation on high-resolution images.
|
| 74 |
+
\vspace{-4.5mm}
|
| 75 |
+
% \subsubsection{Comparing with SOTA}
|
| 76 |
+
\paragraph{Qualitative Evaluation}
|
| 77 |
+
%\todo{section subject to change}\\
|
| 78 |
+
We provide visual comparisons of segmentation models in Figure~\ref{fig:visual}. We show the ``local'' DeepLabv3+ model, as it consistently outperforms its ``global'' variant by predicting wire masks in a sliding-window manner at the original image resolution. As a trade-off, without global context, the model suffers from over-prediction. CascadePSP is designed to refine common-object masks given a coarse input mask, and thus fails to produce satisfactory results when the input is inaccurate or incomplete. Similarly, the refinement module of MagNet does not handle inaccurate wire predictions. ISDNet performs best among the related methods, but its quality is still unsatisfactory because it uses a lightweight model with limited capacity. Compared to all these methods, our model captures both global context and local details, and thus produces more accurate mask predictions.
|
| 79 |
+
\vspace{-4mm}
|
| 80 |
+
\paragraph{Ablation Studies}
|
| 81 |
+
|
| 82 |
+
% \subsubsection{Effectiveness of global logit map}
|
| 83 |
+
|
| 84 |
+
% We show that using the global logit map as input to the fine module conveys more contextual information effectively than using the local logit map. For comparison, we train a separate two-stage model, where we only crop and resize the logit map at the location of the local image patch as input to the fine module.
|
| 85 |
+
|
| 86 |
+
% \input{tables/logit}
|
| 87 |
+
|
| 88 |
+
% As shown in Table~\ref{table:logit}, using only the local logit map yields inferior performances. Specifically, we find that in situations where local refinement is taken place, the global logit map provides sufficient information for the fine module to identify confusing non-wire objects, thus avoiding over-prediction. Figure~\ref{fig:overpredict} demonstrates that our fine module with global logit map input successfully avoids a pattern on the building that strongly resembles a wire.
|
| 89 |
+
|
| 90 |
+
% \input{figure_tex/overpredict}
|
| 91 |
+
|
| 92 |
+
In Table~\ref{table:component_ablation}, we report wire IoUs after removing each component of our model, namely MinMax, MaxPool, and coarse-condition concatenation. We find that all components play a significant role in accurate wire prediction, particularly on large images. Both MinMax and MaxPool are effective in encouraging wire predictions, as shown by the drop in recall when either component is removed (see also Figure~\ref{fig:visual}). The coarse condition, as described in Section~\ref{sec:segmentation}, is crucial for providing global context to the local network; without it, the wire IoU drops significantly.
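The exact MinMax and MaxPool operators are defined in the method section; to illustrate why max-pooled resizing matters for thin structures, the sketch below downsamples a binary wire mask with max-pooling, which keeps one-pixel-wide wires visible where nearest or bilinear resizing would thin them out. This is an illustration, not our implementation.
\begin{verbatim}
import numpy as np

def maxpool_downsample(mask, factor):
    # Downsample an HxW binary mask by `factor` with max-pooling so that
    # wires only 1-2 pixels wide survive at the lower resolution.
    h, w = mask.shape
    mask = mask[:h - h % factor, :w - w % factor]  # crop remainder (pad instead if preferred)
    h, w = mask.shape
    return mask.reshape(h // factor, factor, w // factor, factor).max(axis=(1, 3))

# A 1-pixel-wide horizontal wire survives 8x downsampling:
m = np.zeros((64, 64), dtype=bool)
m[10, :] = True
print(maxpool_downsample(m, 8).sum())  # 8 coarse cells remain marked as wire
\end{verbatim}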
|
| 93 |
+
|
| 94 |
+
Table~\ref{table:thresholds} shows the wire IoUs and inference speeds of our two-stage model as $\alpha$ changes. We observe a consistent decrease in performance as $\alpha$ increases. On the other hand, setting $\alpha$ to 0.01 barely decreases the IoU while significantly boosting inference speed, which indicates that the coarse network effectively restricts refinement to windows that actually contain wires.
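As a reminder of how $\alpha$ is used here: a window is passed to the fine module only if the coarse prediction marks at least a fraction $\alpha$ of its pixels as wire. The sketch below assumes the gate is computed on the thresholded coarse mask; whether probabilities or hard predictions are used is an implementation detail.
\begin{verbatim}
def needs_refinement(coarse_window_mask, alpha=0.01):
    # coarse_window_mask: boolean array for one sliding window of the coarse output.
    wire_fraction = float(coarse_window_mask.mean())
    return wire_fraction >= alpha

# alpha = 0.0 refines every window; larger alpha skips more windows and runs
# faster, at the cost of recall, as the table shows.
\end{verbatim}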
|
| 95 |
+
% We believe that in addition to providing a condition for refinement, the coarse module also acts as a suppressor to eliminate false positives, it does so by skipping potential false positives that would otherwise be mis-classified as wires. As a result, setting the optimal $\alpha$ allows the network to predict accurate segmentation masks, avoid over-prediction, and maintain a high inference speed all at the same time.
|
| 96 |
+
|
| 97 |
+
% Note that when $\alpha=0.0\%$ (refine on all windows), our model still outperforms a single sliding-window model (70.3\% vs. 69.0\%). This means that the coarse module indeed provides useful information to the fine module via the logit map, which further justifies our two-stage design.
|
| 98 |
+
|
| 99 |
+
% \subsubsection{Effectiveness of model components}
|
| 100 |
+
|
| 101 |
+
% We find that the global context provides vital information to the local branch during inference to avoid false positives. When the global branch softmax is fed into the local branch together with the local image, the network suppresses predictions at regions that look like wires, such as pavement cracks. We show these in Figure X. Quantitatively, the wire IoU drops significantly without conditioning on the global branch.
|
| 102 |
+
|
| 103 |
+
% \subsubsection{Effectiveness of Maxpool resize}
|
| 104 |
+
|
| 105 |
+
% \subsubsection{Effectiveness of MinMax input}
|
| 106 |
+
|
| 107 |
+
% Here we show the useful information from the minmax image. Figure X compares a model trained with/without minmax. As can be seen, for extremely bright/dark wires, minmax is able to emphasize this feature and ensure prediction in those areas where they are not easily seen after downsampling.
|
| 108 |
+
|
| 109 |
+
|
| 110 |
+
|
| 111 |
+
\subsection{Wire Inpainting Evaluation}
|
| 112 |
+
\vspace{-1mm}
|
| 113 |
+
We evaluate our wire inpainting model on the synthetic dataset; results are shown in Table~\ref{exp:wire_inp}. Our model architecture closely follows LaMa~\cite{suvorov2022resolution}; the differences are the training data and the proposed color adjustment module that addresses color inconsistency. We also compare our method with PatchMatch~\cite{barnes2009patchmatch} (patch synthesis), DeepFillv2~\cite{yu2019free} (Contextual Attention), CMGAN~\cite{zheng2022cm} and FcF~\cite{jain2022keys} (based on StyleGAN2~\cite{karras2020analyzing}), and LDM~\cite{rombach2022high} (diffusion-based). Inference speed is measured on a single A100-80G GPU. Visual results on synthetic and real images are shown in Figure~\ref{fig:wire_inp}. PatchMatch, as a traditional patch-synthesis method, produces consistent color and texture, which leads to a high PSNR; however, it performs much worse on complicated structure completion. The StyleGAN-based CMGAN and FcF are both too heavy for thin and sparse wires. Diffusion-based models such as LDM tend to generate arbitrary objects and patterns. DeepFillv2 and the official Big-LaMa both suffer from severe color inconsistency, especially in sky regions. Our model balances quality and efficiency, and performs well on both structure completion and color consistency.
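Among the reported metrics, PSNR has a closed form (LPIPS and FID additionally require pretrained networks and are not sketched here). The snippet below gives the standard definition; whether the error is computed over the full image or only around the masked region is an evaluation-protocol detail not restated in this section.
\begin{verbatim}
import numpy as np

def psnr(result, reference, max_val=255.0):
    # Peak signal-to-noise ratio between an inpainted result and its ground truth.
    a = result.astype(np.float64)
    b = reference.astype(np.float64)
    mse = np.mean((a - b) ** 2)
    if mse == 0:
        return float("inf")
    return 10.0 * np.log10(max_val ** 2 / mse)
\end{verbatim}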
|
| 114 |
+
Note that we use a tile-based method at inference time.
|
| 115 |
+
% with a window size of $512 \times 512$ and an overlap of $32$.
|
| 116 |
+
The tile-based strategy is viable because of the characteristics of wires: they are sparse, thin, and long, so the masked area within each tile is small and local context suffices for completion. More high-resolution inpainting results are provided in the supplementary materials.
|
| 117 |
+
|
| 118 |
+
% \begin{figure}[h!]
|
| 119 |
+
% \centering
|
| 120 |
+
% \captionsetup{type=figure}
|
| 121 |
+
% \includegraphics[width=1.\linewidth]{figures/inpainting_result.pdf}
|
| 122 |
+
% \vspace{-6mm}
|
| 123 |
+
% \captionof{figure}{\textbf{Inpainting Comparison}. Our model performs well on complicated structure completion and color consistency, especially on building facades and sky regions containing plain and uniform color.
|
| 124 |
+
% \vspace{-3mm}
|
| 125 |
+
% }
|
| 126 |
+
% \label{fig:wire_inp}
|
| 127 |
+
% \end{figure}
|
| 128 |
+
|
| 129 |
+
|
| 130 |
+
% \begin{table}[t]\setlength{\tabcolsep}{5pt}
|
| 131 |
+
% \setlength{\abovecaptionskip}{8pt}
|
| 132 |
+
% \centering
|
| 133 |
+
% \footnotesize
|
| 134 |
+
% % \scriptsize
|
| 135 |
+
% % \tiny
|
| 136 |
+
|
| 137 |
+
% %\vspace{-2ex}
|
| 138 |
+
% % \resizebox{\columnwidth}{!}{
|
| 139 |
+
% \begin{tabular}{r|c c c|c}
|
| 140 |
+
% \hline
|
| 141 |
+
% Model &PSNR$\uparrow$&LPIPS$\downarrow$&FID$\downarrow$ &Speed (s/img)\\ \hline
|
| 142 |
+
% %Photoshop\\ %should be easy to run batch testing
|
| 143 |
+
% PatchMatch \cite{barnes2009patchmatch}&50.29 &0.0294 & 5.0403 & -\\
|
| 144 |
+
% DeepFillv2 \cite{yu2019free} &47.01 &0.0374&8.0086 &0.009\\
|
| 145 |
+
% CMGAN \cite{zheng2022cm} &50.07 &0.0255 &3.8286 &0.141\\
|
| 146 |
+
% FcF \cite{jain2022keys}&48.82&0.0322&4.7848&0.048\\
|
| 147 |
+
% LDM \cite{rombach2022high} & 45.96 & 0.0401& 10.1687 & 4.280\\
|
| 148 |
+
% Big-LaMa \cite{suvorov2022resolution} & 49.63 & 0.0267& 4.1245 &0.034\\
|
| 149 |
+
% Ours (LaMa-Wire) & 50.06 & 0.0259 & 3.6950 &0.034\\
|
| 150 |
+
% \hline
|
| 151 |
+
% \end{tabular}
|
| 152 |
+
% \caption{Quantitative results of inpainting on our synthetic wire inpainting evaluation dataset (1000 images). Our model achieves the highest perceptual quality in terms of FID, and has a balanced speed and quality.}
|
| 153 |
+
% % }
|
| 154 |
+
% \label{exp:wire_inp}
|
| 155 |
+
% % \vspace{-4mm}
|
| 156 |
+
% \end{table}
|
paper-tex/supplement.bbl
ADDED
|
@@ -0,0 +1,24 @@
|
| 1 |
+
\begin{thebibliography}{1}\itemsep=-1pt
|
| 2 |
+
|
| 3 |
+
\bibitem{cascadepsp}
|
| 4 |
+
Ho~Kei Cheng, Jihoon Chung, Yu-Wing Tai, and Chi-Keung Tang.
|
| 5 |
+
\newblock Cascadepsp: toward class-agnostic and very high-resolution
|
| 6 |
+
segmentation via global and local refinement.
|
| 7 |
+
\newblock In {\em Proceedings of the IEEE/CVF Conference on Computer Vision and
|
| 8 |
+
Pattern Recognition}, pages 8890--8899, 2020.
|
| 9 |
+
|
| 10 |
+
\bibitem{isdnet}
|
| 11 |
+
Shaohua Guo, Liang Liu, Zhenye Gan, Yabiao Wang, Wuhao Zhang, Chengjie Wang,
|
| 12 |
+
Guannan Jiang, Wei Zhang, Ran Yi, Lizhuang Ma, et~al.
|
| 13 |
+
\newblock Isdnet: Integrating shallow and deep networks for efficient
|
| 14 |
+
ultra-high resolution segmentation.
|
| 15 |
+
\newblock In {\em Proceedings of the IEEE/CVF Conference on Computer Vision and
|
| 16 |
+
Pattern Recognition}, pages 4361--4370, 2022.
|
| 17 |
+
|
| 18 |
+
\bibitem{magnet}
|
| 19 |
+
Chuong Huynh, Anh~Tuan Tran, Khoa Luu, and Minh Hoai.
|
| 20 |
+
\newblock Progressive semantic segmentation.
|
| 21 |
+
\newblock In {\em Proceedings of the IEEE/CVF Conference on Computer Vision and
|
| 22 |
+
Pattern Recognition}, pages 16755--16764, 2021.
|
| 23 |
+
|
| 24 |
+
\end{thebibliography}
|
paper-tex/supplement.tex
ADDED
|
@@ -0,0 +1,184 @@
|
| 1 |
+
% CVPR 2023 Paper Template
|
| 2 |
+
% based on the CVPR template provided by Ming-Ming Cheng (https://github.com/MCG-NKU/CVPR_Template)
|
| 3 |
+
% modified and extended by Stefan Roth ([email protected])
|
| 4 |
+
|
| 5 |
+
\documentclass[10pt,twocolumn,letterpaper]{article}
|
| 6 |
+
|
| 7 |
+
%%%%%%%%% PAPER TYPE - PLEASE UPDATE FOR FINAL VERSION
|
| 8 |
+
% \usepackage[review]{cvpr} % To produce the REVIEW version
|
| 9 |
+
\usepackage{cvpr} % To produce the CAMERA-READY version
|
| 10 |
+
%\usepackage[pagenumbers]{cvpr} % To force page numbers, e.g. for an arXiv version
|
| 11 |
+
|
| 12 |
+
\usepackage[accsupp]{axessibility} % Improves PDF readability for those with disabilities.
|
| 13 |
+
|
| 14 |
+
% Include other packages here, before hyperref.
|
| 15 |
+
\usepackage[normalem]{ulem}
|
| 16 |
+
\usepackage{graphicx}
|
| 17 |
+
\usepackage{amsmath}
|
| 18 |
+
\usepackage{amssymb}
|
| 19 |
+
\usepackage{booktabs}
|
| 20 |
+
\usepackage{xcolor}
|
| 21 |
+
\usepackage{comment}
|
| 22 |
+
\usepackage{enumitem}
|
| 23 |
+
\usepackage{multirow}
|
| 24 |
+
\usepackage{footnote}
|
| 25 |
+
\newcommand{\todo}[1]{{\color{red}#1}}
|
| 26 |
+
|
| 27 |
+
\newcommand{\benchmark}{WireSegHR}
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
\makeatletter
|
| 31 |
+
\newcommand\footnoteref[1]{\protected@xdef\@thefnmark{\ref{#1}}\@footnotemark}
|
| 32 |
+
\makeatother
|
| 33 |
+
|
| 34 |
+
% It is strongly recommended to use hyperref, especially for the review version.
|
| 35 |
+
% hyperref with option pagebackref eases the reviewers' job.
|
| 36 |
+
% Please disable hyperref *only* if you encounter grave issues, e.g. with the
|
| 37 |
+
% file validation for the camera-ready version.
|
| 38 |
+
%
|
| 39 |
+
% If you comment hyperref and then uncomment it, you should delete
|
| 40 |
+
% ReviewTempalte.aux before re-running LaTeX.
|
| 41 |
+
% (Or just hit 'q' on the first LaTeX run, let it finish, and you
|
| 42 |
+
% should be clear).
|
| 43 |
+
\usepackage[pagebackref,breaklinks,colorlinks]{hyperref}
|
| 44 |
+
|
| 45 |
+
|
| 46 |
+
% Support for easy cross-referencing
|
| 47 |
+
\usepackage[capitalize]{cleveref}
|
| 48 |
+
\crefname{section}{Sec.}{Secs.}
|
| 49 |
+
\Crefname{section}{Section}{Sections}
|
| 50 |
+
\Crefname{table}{Table}{Tables}
|
| 51 |
+
\crefname{table}{Tab.}{Tabs.}
|
| 52 |
+
|
| 53 |
+
|
| 54 |
+
%%%%%%%%% PAPER ID - PLEASE UPDATE
|
| 55 |
+
\def\cvprPaperID{699} % *** Enter the CVPR Paper ID here
|
| 56 |
+
\def\confName{CVPR}
|
| 57 |
+
\def\confYear{2023}
|
| 58 |
+
|
| 59 |
+
|
| 60 |
+
\makeatletter
|
| 61 |
+
\def\@maketitle
|
| 62 |
+
{
|
| 63 |
+
\newpage
|
| 64 |
+
\null
|
| 65 |
+
\iftoggle{cvprrebuttal}{\vspace*{-.3in}}{\vskip .375in}
|
| 66 |
+
\begin{center}
|
| 67 |
+
% smaller title font only for rebuttal
|
| 68 |
+
\iftoggle{cvprrebuttal}{{\large \bf \@title \par}}{{\Large \bf \@title \par}}
|
| 69 |
+
% additional two empty lines at the end of the title
|
| 70 |
+
\iftoggle{cvprrebuttal}{\vspace*{-22pt}}{\vspace*{24pt}}
|
| 71 |
+
{
|
| 72 |
+
\large
|
| 73 |
+
\lineskip .5em
|
| 74 |
+
\begin{tabular}[t]{c}
|
| 75 |
+
\iftoggle{cvprfinal}{
|
| 76 |
+
\@author
|
| 77 |
+
}{
|
| 78 |
+
\iftoggle{cvprrebuttal}{}{
|
| 79 |
+
Anonymous \confName~submission\\
|
| 80 |
+
\vspace*{1pt}\\
|
| 81 |
+
Paper ID \cvprPaperID
|
| 82 |
+
}
|
| 83 |
+
}
|
| 84 |
+
\end{tabular}
|
| 85 |
+
\par
|
| 86 |
+
}
|
| 87 |
+
% additional small space at the end of the author name
|
| 88 |
+
% additional empty line at the end of the title block
|
| 89 |
+
\vspace*{-10mm}
|
| 90 |
+
\vspace{-5mm}
|
| 91 |
+
\end{center}
|
| 92 |
+
}
|
| 93 |
+
\makeatother
|
| 94 |
+
|
| 95 |
+
\title{Supplementary Material: Automatic High Resolution Wire Segmentation and Removal}
|
| 96 |
+
|
| 97 |
+
\begin{document}
|
| 98 |
+
|
| 99 |
+
%%%%%%%%% TITLE - PLEASE UPDATE
|
| 100 |
+
% \title{Automatic Wire Segmentation for Inpainting}
|
| 101 |
+
% \title{Segmentation to the Extreme: \\A Large-Scale Wire Segmentation Dataset and a Pilot Study}
|
| 102 |
+
%\title{Segmentation to the Extreme: \\A Large-Scale High-Resolution Wire Segmentation Dataset and a Pilot Study}
|
| 103 |
+
%\title{WINE: WIre NEver Appeared in Your Photos}
|
| 104 |
+
|
| 105 |
+
\maketitle
|
| 106 |
+
\thispagestyle{empty}
|
| 107 |
+
|
| 108 |
+
|
| 109 |
+
\vspace{-1mm}
|
| 110 |
+
\section{Comparison with Pixel 6}
|
| 111 |
+
\vspace{-1mm}
|
| 112 |
+
We show a visual comparison between our model and the Google Pixel 6 ``Magic Eraser'' feature in Figure~\ref{fig:pixel6}. Without manual intervention, Magic Eraser performs well on wires against a clean background, but struggles with thin wires that are barely visible ((A), upper) and with wires against a complicated background ((A), lower). We also pass our segmentation mask to our wire inpainting model to obtain the wire removal result, shown in the lower image of (B).
|
| 113 |
+
|
| 114 |
+
\vspace{-1mm}
|
| 115 |
+
\section{Failure cases}
|
| 116 |
+
\vspace{-1mm}
|
| 117 |
+
We show challenging cases where our model fails to predict accurate wire masks in Figure~\ref{fig:new_failures}. These include regions that closely resemble wires (top row), severe blending of wires into the background (middle row), and extreme lighting conditions (bottom row).
|
| 118 |
+
|
| 119 |
+
\vspace{-1mm}
|
| 120 |
+
\section{Panorama}
|
| 121 |
+
\vspace{-1mm}
|
| 122 |
+
Our two-stage model leverages the sparsity of wires in natural images and generalizes efficiently to ultra-high-resolution images such as panoramas. We show one panoramic image at $11$K by $1.5$K resolution in Figure~\ref{fig:new_panorama}. Note that our method produces high-quality wire segmentation, covering even wires that are almost invisible, so that our proposed wire removal step can effectively remove these regions.
|
| 123 |
+
|
| 124 |
+
|
| 125 |
+
\input{figure_tex/pixel6.tex}
|
| 126 |
+
\input{figure_tex/new_failure_cases.tex}
|
| 127 |
+
\input{figure_tex/new_panorama.tex}
|
| 128 |
+
|
| 129 |
+
|
| 130 |
+
\section{Segmentation and inpainting visualizations}
|
| 131 |
+
\vspace{-1mm}
|
| 132 |
+
We show our wire segmentation and inpainting results for several common photography scenes as well as some challenging cases in Figure~\ref{fig:additional_visualizations}. Our model successfully handles numerous challenging scenarios, including strong backlighting (top row), complex background texture (2nd row), low light (3rd row), and barely visible wires (4th row). A typical use case is shown in the last row.
|
| 133 |
+
%We also provide 20 additional samples in the attached HTML \mbox{(\textit{additional\_visualizations.html})}.
|
| 134 |
+
|
| 135 |
+
\section{Experiments on other datasets}
|
| 136 |
+
\vspace{-1mm}
|
| 137 |
+
Most existing wire-like datasets are either at low resolution or built for specific purposes (e.g., aerial imaging), and thus do not offer the scene diversity of WireSegHR. The TTPLA~[2] dataset shares the power-line class with our dataset, although it contains only aerial images. Table~\ref{ttpla_exp} reports the evaluation of our model on the TTPLA test set and of the TTPLA model on our WireSegHR test set.
|
| 138 |
+
% We provide a model performance comparison on this dataset using two experiments.
|
| 139 |
+
% To abide by the request to refrain from significant additional experiments,
|
| 140 |
+
% We first test our trained model on the TTPLA test set. We then test the trained TTPLA model on our WireSegHR-500 test set:
|
| 141 |
+
|
| 142 |
+
|
| 143 |
+
\begin{table}[h!]
|
| 144 |
+
\resizebox{\linewidth}{!}{
|
| 145 |
+
\centering
|
| 146 |
+
\begin{tabular}{c|c|c}
|
| 147 |
+
Dataset & Model & IoU (\%) \\\hline\hline
|
| 148 |
+
\multirow{3}{*}{TTPLA (Power Line only)} & TTPLA (ResNet-50, 700$\times$ 700) & 18.9 \\
|
| 149 |
+
& Ours (ResNet-50) & 33.1 \\
|
| 150 |
+
& Ours (MiT-b2) & 42.7 \\\hline
|
| 151 |
+
\multirow{3}{*}{WireSegHR} & TTPLA (ResNet-50, 700$\times$ 700) & 3.5 \\
|
| 152 |
+
& Ours (ResNet-50) & 47.8 \\
|
| 153 |
+
& Ours (MiT-b2) & 60.8\\\hline
|
| 154 |
+
\end{tabular}}
|
| 155 |
+
\caption{Comparison with TTPLA.}
|
| 156 |
+
\vspace{-5mm}
|
| 157 |
+
\label{ttpla_exp}
|
| 158 |
+
\end{table}
|
| 159 |
+
|
| 160 |
+
% With our ResNet-50 model, MiT-b2 model, we obtain 33.1\% and 42.7\% wire IoU respectively, against 18.9\% from their original model.
|
| 161 |
+
TTPLA is trained at a fixed resolution ($700\times 700$) and takes the entire image as input at inference time, which requires significant downsampling of our test set.
|
| 162 |
+
% may be fine for large structures such as transmission towers, but
|
| 163 |
+
As a result, the quality of thin wires deteriorates in both the image and the label. Our model drops in performance on the TTPLA dataset due to different annotation definitions: we annotate all wire-like objects, while TTPLA annotates only power lines.
|
| 164 |
+
|
| 165 |
+
\vspace{-1mm}
|
| 166 |
+
\section{Additional training details}
|
| 167 |
+
\vspace{-1mm}
|
| 168 |
+
\paragraph{CascadePSP~\cite{cascadepsp}}
|
| 169 |
+
We follow the default training steps provided by the CascadePSP code\footnote{\label{note1}\href{https://github.com/hkchengrex/CascadePSP}{https://github.com/hkchengrex/CascadePSP}}. During training, we sample patches that contain at least 1\% wire pixels. During inference, we feed the predictions of the global DeepLabv3+ model to the pretrained/retrained CascadePSP model to obtain the refined wire mask. In both cases, we follow the default inference code\footnoteref{note1} to produce the final mask.
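The ``at least 1\% wire pixels'' constraint (also used for the MagNet and ISDNet baselines below) can be enforced by rejection sampling. The sketch is illustrative: the helper name, retry budget, and crop size are placeholders rather than the baseline's actual settings.
\begin{verbatim}
import random

def sample_wire_patch(image, wire_mask, size=1024, min_wire_frac=0.01, max_tries=100):
    # Rejection-sample a size x size crop whose wire-pixel fraction is >= min_wire_frac.
    h, w = wire_mask.shape
    for _ in range(max_tries):
        y = random.randint(0, h - size)
        x = random.randint(0, w - size)
        crop_mask = wire_mask[y:y + size, x:x + size]
        if crop_mask.mean() >= min_wire_frac:
            return image[y:y + size, x:x + size], crop_mask
    return None  # caller may skip the image or fall back to an unconstrained crop
\end{verbatim}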
|
| 170 |
+
\vspace{-3mm}
|
| 171 |
+
\paragraph{MagNet~\cite{magnet}} MagNet\footnote{\href{https://github.com/VinAIResearch/MagNet}{https://github.com/VinAIResearch/MagNet}} obtains the initial mask predictions from a single backbone trained on all refinement scales. For a fair comparison, we adopt a 2-scale setting of MagNet, similar to our two-stage model, where the image is downsampled to $1024\times 1024$ at the global scale and kept at the original resolution at the local scale. To this end, we train a single DeepLabv3+ model by either downsampling each training image to $1024\times 1024$ or randomly cropping $1024\times 1024$ patches at the original resolution. The sampled patches contain at least 1\% wire pixels. We then train the refinement module on the predictions from this DeepLabv3+ model, following the default setting. Inference is kept the same as in the original MagNet model.
|
| 172 |
+
|
| 173 |
+
\vspace{-2mm}
|
| 174 |
+
\paragraph{ISDNet~\cite{isdnet}}
|
| 175 |
+
ISDNet\footnote{\href{https://github.com/cedricgsh/ISDNet}{https://github.com/cedricgsh/ISDNet}} performs inference on the entire image without a sliding window. During training, we therefore resize all images to $5000\times 5000$ and randomly crop $2500\times 2500$ windows so that the inputs fit into GPU memory; sampled patches must contain at least 1\% wire pixels. During inference, all images are resized to $5000\times 5000$; we observe that this yields better results than keeping images smaller than $5000\times 5000$ at their original sizes.
|
| 176 |
+
|
| 177 |
+
\input{figure_tex/additional_visualizations.tex}
|
| 178 |
+
|
| 179 |
+
{\small
|
| 180 |
+
\bibliographystyle{ieee_fullname}
|
| 181 |
+
\bibliography{egbib}
|
| 182 |
+
}
|
| 183 |
+
|
| 184 |
+
\end{document}
|
paper-tex/tables/component.tex
ADDED
|
@@ -0,0 +1,25 @@
|
| 1 |
+
%auto-ignore
|
| 2 |
+
|
| 3 |
+
\begin{table}[t!]
|
| 4 |
+
\centering
|
| 5 |
+
\resizebox{0.49\textwidth}{!}{
|
| 6 |
+
\renewcommand{\arraystretch}{1}
|
| 7 |
+
\addtolength{\tabcolsep}{-2pt}
|
| 8 |
+
\begin{tabular}{r|c|ccc|ccc}
|
| 9 |
+
\hline
|
| 10 |
+
Model & \begin{tabular}[x]{@{}c@{}}Wire\\IoU\end{tabular} & F1 & Precision & Recall & \begin{tabular}[x]{@{}c@{}}IoU\\(Small)\end{tabular} & \begin{tabular}[x]{@{}c@{}}IoU\\(Medium)\end{tabular} & \begin{tabular}[x]{@{}c@{}}IoU\\(Large)\end{tabular} \\\hline\hline
|
| 11 |
+
%& \begin{tabular}[x]{@{}c@{}}Speed\\(s/img)\end{tabular} & \begin{tabular}[x]{@{}c@{}}Memory\\(GB)\end{tabular}\\ \hline\hline
|
| 12 |
+
Ours & 60.83 & 75.65 & 83.62 & 69.06 & 63.52 & 59.83 & 62.93 \\ \hline
|
| 13 |
+
-- MinMax & 60.01 & 75.01 & 84.87 & 67.2 & 63.67 & 58.99 & 61.97 \\
|
| 14 |
+
-- MaxPool & 59.86 & 74.89 & 85.25 & 66.78 & 61.45 & 59.40 & 60.76 \\
|
| 15 |
+
-- Coarse & 56.92 & 72.55 & 82.91 & 64.49 & 62.83 & 57.42 & 54.47 \\ \hline
|
| 16 |
+
\end{tabular}
|
| 17 |
+
\addtolength{\tabcolsep}{2pt}
|
| 18 |
+
}
|
| 19 |
+
\vspace{-1mm}
|
| 20 |
+
\caption{Ablation study of our model components.
|
| 21 |
+
% \yq{This table can be made one-column to save space.}\cezhang{+1, we're running out of space.}
|
| 22 |
+
}
|
| 23 |
+
\label{table:component_ablation}
|
| 24 |
+
\vspace{-2mm}
|
| 25 |
+
\end{table}
|
paper-tex/tables/inpaint.tex
ADDED
|
@@ -0,0 +1,30 @@
|
| 1 |
+
%auto-ignore
|
| 2 |
+
|
| 3 |
+
\begin{table}[t]\setlength{\tabcolsep}{5pt}
|
| 4 |
+
\setlength{\abovecaptionskip}{8pt}
|
| 5 |
+
\centering
|
| 6 |
+
\footnotesize
|
| 7 |
+
% \scriptsize
|
| 8 |
+
% \tiny
|
| 9 |
+
|
| 10 |
+
%\vspace{-2ex}
|
| 11 |
+
% \resizebox{\columnwidth}{!}{
|
| 12 |
+
\begin{tabular}{r|c c c|c}
|
| 13 |
+
\hline
|
| 14 |
+
Model &PSNR$\uparrow$&LPIPS$\downarrow$&FID$\downarrow$ &Speed (s/img)\\ \hline
|
| 15 |
+
%Photoshop\\ %should be easy to run batch testing
|
| 16 |
+
PatchMatch \cite{barnes2009patchmatch}&50.29 &0.0294 & 5.0403 & -\\
|
| 17 |
+
DeepFillv2 \cite{yu2019free} &47.01 &0.0374&8.0086 &0.009\\
|
| 18 |
+
CMGAN \cite{zheng2022cm} &50.07 &0.0255 &3.8286 &0.141\\
|
| 19 |
+
FcF \cite{jain2022keys}&48.82&0.0322&4.7848&0.048\\
|
| 20 |
+
LDM \cite{rombach2022high} & 45.96 & 0.0401& 10.1687 & 4.280\\
|
| 21 |
+
Big-LaMa \cite{suvorov2022resolution} & 49.63 & 0.0267& 4.1245 &0.034\\
|
| 22 |
+
Ours (LaMa-Wire) & 50.06 & 0.0259 & 3.6950 &0.034\\
|
| 23 |
+
\hline
|
| 24 |
+
\end{tabular}
|
| 25 |
+
\vspace{-1mm}
|
| 26 |
+
\caption{Quantitative results of inpainting on our synthetic wire inpainting evaluation dataset (1000 images). Our model achieves the highest perceptual quality in terms of FID and offers a good balance of speed and quality.}
|
| 27 |
+
% }
|
| 28 |
+
\label{exp:wire_inp}
|
| 29 |
+
% \vspace{-4mm}
|
| 30 |
+
\end{table}
|
paper-tex/tables/logit.tex
ADDED
|
@@ -0,0 +1,18 @@
|
| 1 |
+
%auto-ignore
|
| 2 |
+
\begin{table}[h!]
|
| 3 |
+
\centering
|
| 4 |
+
\resizebox{0.98\linewidth}{!}{
|
| 5 |
+
\renewcommand{\arraystretch}{1.1}
|
| 6 |
+
\begin{tabular}{p{0.27\linewidth}|c|ccc}
|
| 7 |
+
\hline
|
| 8 |
+
Model & IoU (\%) & F1 (\%) & Precision (\%) & Recall (\%)\\ \hline\hline
|
| 9 |
+
Local logit only & 71.3 & 83.2 & 84.5 & \textbf{81.9} \\ \hline
|
| 10 |
+
Global logit + binary location map (ours) & \textbf{71.9} & \textbf{83.6} & \textbf{86.2} & 81.3\\ \hline
|
| 11 |
+
|
| 12 |
+
\end{tabular}
|
| 13 |
+
}
|
| 14 |
+
\vspace{-1mm}
|
| 15 |
+
\caption{Comparison between using only local logit map and global logit map as fine module input. By including the logit map of the entire image, our model avoids over-prediction. }
|
| 16 |
+
\label{table:logit}
|
| 17 |
+
\vspace{-2mm}
|
| 18 |
+
\end{table}
|
paper-tex/tables/results.tex
ADDED
|
@@ -0,0 +1,53 @@
|
| 1 |
+
%auto-ignore
|
| 2 |
+
\begin{table*}[t!]
|
| 3 |
+
\vspace{2mm}
|
| 4 |
+
\centering
|
| 5 |
+
\resizebox{0.98\textwidth}{!}{
|
| 6 |
+
\renewcommand{\arraystretch}{1.1}
|
| 7 |
+
\addtolength{\tabcolsep}{-2pt}
|
| 8 |
+
\begin{tabular}{r|c|ccc|ccc|ccc}
|
| 9 |
+
\hline
|
| 10 |
+
Model & \begin{tabular}[x]{@{}c@{}}Wire\\IoU\end{tabular} & F1 & Precision & Recall & \begin{tabular}[x]{@{}c@{}}IoU\\(Small)\end{tabular} & \begin{tabular}[x]{@{}c@{}}IoU\\(Medium)\end{tabular} & \begin{tabular}[x]{@{}c@{}}IoU\\(Large)\end{tabular} & \begin{tabular}[x]{@{}c@{}}Avg. Time\\(s/img)\end{tabular} & \begin{tabular}[x]{@{}c@{}}Min. Time\\(s/img)\end{tabular} & \begin{tabular}[x]{@{}c@{}}Max. Time\\(s/img)\end{tabular}\\ \hline\hline
|
| 11 |
+
DeepLabv3+ (Global)~\cite{deeplabv3p} & 37.77 & 54.83 & 69.68 & 45.20 & 51.62 & 38.89 & 31.89 & 0.22 & 0.07 & 0.78 \\
|
| 12 |
+
DeepLabv3+ (Local)~\cite{deeplabv3p} & 48.66 & 65.46 & 68.13 & 63.0 & 60.23 & 51.44 & 40.17 & 3.27 & 0.05 & 16.59 \\ \hline
|
| 13 |
+
CascadePSP (Pretrained)~\cite{cascadepsp} & 20.44 & 33.94 & 62.19 & 23.34 & 33.64 & 21.80 & 13.78 & 2.32 & 0.37 & 36.79 \\
|
| 14 |
+
CascadePSP (Retrained)~\cite{cascadepsp} & 26.85 & 42.33 & 52.44 & 35.49 & 48.22 & 28.97 & 15.80 & 2.25 & 0.37 & 25.37 \\
|
| 15 |
+
MagNet~\cite{magnet} & 33.71 & 50.42 & 87.69 & 35.38 & 43.59 & 32.67 & 34.48 & 3.89 & 0.54 & 17.97\\
|
| 16 |
+
MagNet-Fast~\cite{magnet} & 37.87 & 54.94 & 67.98 & 46.09 & 46.75 & 35.88 & 41.42 & 1.36 & 0.55 & 5.33 \\
|
| 17 |
+
ISDNet (R-18)~\cite{isdnet} & 46.52 & 63.50 & 77.56 & 53.75 & 55.09 & 47.15 & 43.34 & 0.29 & 0.12 & 0.86\\
|
| 18 |
+
ISDNet (MiT-b2)~\cite{isdnet} & 47.90 & 64.77 & 77.38 & 55.70 & 54.48 & 46.77 & 49.51 & 0.26 & 0.13 & 1.02 \\ \hline
|
| 19 |
+
Ours (R-50) & 47.75 & 64.64 & 74.86 & 56.87 & 60.68 & 50.19 & 38.19 & 1.24 & 0.13 & 4.67 \\
|
| 20 |
+
Ours (MiT-b2) & 60.83 & 75.65 & 83.62 & 69.06 & 63.52 & 59.83 & 62.93 & 0.82 & 0.07 & 3.36 \\ \hline
|
| 21 |
+
\end{tabular}
|
| 22 |
+
\addtolength{\tabcolsep}{2pt}
|
| 23 |
+
}
|
| 24 |
+
\vspace{-1mm}
|
| 25 |
+
\caption{Performance of common semantic segmentation models and recent high-resolution semantic segmentation models on our dataset. Our dataset poses many challenges that these high-resolution segmentation models fail to tackle effectively.
|
| 26 |
+
% \yq{This table can be made one-column to save space.}\cezhang{+1, we're running out of space.}
|
| 27 |
+
}
|
| 28 |
+
\label{table:results}
|
| 29 |
+
\end{table*}
|
| 30 |
+
|
| 31 |
+
% \begin{table}[t!]
|
| 32 |
+
% \centering
|
| 33 |
+
% \resizebox{0.49\textwidth}{!}{
|
| 34 |
+
% \renewcommand{\arraystretch}{1}
|
| 35 |
+
% \addtolength{\tabcolsep}{-2pt}
|
| 36 |
+
% \begin{tabular}{r|c|ccc|ccc}
|
| 37 |
+
% \hline
|
| 38 |
+
% Model & \begin{tabular}[x]{@{}c@{}}Wire\\IoU\end{tabular} & F1 & Precision & Recall & \begin{tabular}[x]{@{}c@{}}IoU\\(Small)\end{tabular} & \begin{tabular}[x]{@{}c@{}}IoU\\(Medium)\end{tabular} & \begin{tabular}[x]{@{}c@{}}IoU\\(Large)\end{tabular} \\\hline\hline
|
| 39 |
+
% %& \begin{tabular}[x]{@{}c@{}}Speed\\(s/img)\end{tabular} & \begin{tabular}[x]{@{}c@{}}Memory\\(GB)\end{tabular}\\ \hline\hline
|
| 40 |
+
% Ours & 60.83 & 75.65 & 83.62 & 69.06 & 63.52 & 59.83 & 62.93 \\ \hline
|
| 41 |
+
% -- MinMax & 60.01 & 75.01 & 84.87 & 67.2 & 63.67 & 58.99 & 61.97 \\
|
| 42 |
+
% -- MaxPool & 59.86 & 74.89 & 85.25 & 66.78 & 61.45 & 59.40 & 60.76 \\
|
| 43 |
+
% -- Coarse & 56.92 & 72.55 & 82.91 & 64.49 & 62.83 & 57.42 & 54.47 \\ \hline
|
| 44 |
+
% \end{tabular}
|
| 45 |
+
% \addtolength{\tabcolsep}{2pt}
|
| 46 |
+
% }
|
| 47 |
+
% \vspace{-3mm}
|
| 48 |
+
% \caption{Ablation study of our model components.
|
| 49 |
+
% % \yq{This table can be made one-column to save space.}\cezhang{+1, we're running out of space.}
|
| 50 |
+
% }
|
| 51 |
+
% \label{table:component_ablation}
|
| 52 |
+
% \vspace{-2mm}
|
| 53 |
+
% \end{table}
|
paper-tex/tables/stats.tex
ADDED
|
@@ -0,0 +1,22 @@
|
| 1 |
+
%auto-ignore
|
| 2 |
+
\begin{table}[h!]
|
| 3 |
+
\centering
|
| 4 |
+
\resizebox{\linewidth}{!}{
|
| 5 |
+
\renewcommand{\arraystretch}{1.1}
|
| 6 |
+
\begin{tabular}{r|cccc}
|
| 7 |
+
\hline
|
| 8 |
+
|
| 9 |
+
Dataset & \begin{tabular}[x]{@{}c@{}}\# Wire\\Images\end{tabular} & \begin{tabular}[x]{@{}c@{}}Min.\\Image Size\end{tabular} &\begin{tabular}[x]{@{}c@{}}Max.\\Image Size\end{tabular} &\begin{tabular}[x]{@{}c@{}}Median\\Image Size\end{tabular} \\ \hline
|
| 10 |
+
Powerline~\cite{powerlinedataset} & 2000 & 128$\times$128 & 128$\times$128 & 128$\times$128\\
|
| 11 |
+
PLDU~\cite{pldu} & 573 & 540$\times$360 & 540$\times$360 & 540$\times$360 \\
|
| 12 |
+
PLDM~\cite{pldu} & 287 & 540$\times$360 & 540$\times$360 & 540$\times$360 \\
|
| 13 |
+
TTPLA~\cite{ttpla} & 1100 & 3840$\times$2160 & 3840$\times$2160 & 3840$\times$2160\\ \hline
|
| 14 |
+
\textbf{Ours} & 6000 & 360$\times$240 & 15904$\times$10608 & 5040$\times$3360 \\ \hline
|
| 15 |
+
|
| 16 |
+
\end{tabular}
|
| 17 |
+
}
|
| 18 |
+
\vspace{-2mm}
|
| 19 |
+
\caption{Statistics of our wire dataset compared to others.}%Image and annotation statistics of our test set.\yq{You can make it as a 4x4 table, with each row: item, min, max, avg; and each column: item name, image size, wire thickness, and percentage. (mt: done)}}
|
| 20 |
+
\vspace{-5mm}
|
| 21 |
+
\label{table:stats}
|
| 22 |
+
\end{table}
|
paper-tex/tables/thresholds.tex
ADDED
|
@@ -0,0 +1,22 @@
|
| 1 |
+
%auto-ignore
|
| 2 |
+
\begin{table}[t!]
|
| 3 |
+
\centering
|
| 4 |
+
\resizebox{0.9\columnwidth}{!}{
|
| 5 |
+
\renewcommand{\arraystretch}{1.1}
|
| 6 |
+
\begin{tabular}{r|c|ccc|cc}
|
| 7 |
+
\hline
|
| 8 |
+
$\alpha$& \begin{tabular}[x]{@{}c@{}}Wire\\IoU\end{tabular} & F1 & Precision & Recall & \begin{tabular}[x]{@{}c@{}}Avg. Time\\(s/img)\end{tabular} &
|
| 9 |
+
Speed up\\ \hline
|
| 10 |
+
0.0 & 60.97 & 75.75 & 82.63 & 69.93 & 1.91 & 1$\times$ \\
|
| 11 |
+
0.01 & 60.83 & 75.65 & 83.62 & 69.06 & 0.82 & 2.3$\times$ \\
|
| 12 |
+
0.02 & 60.35 & 75.27 & 83.97 & 68.20 & 0.75 & 2.5$\times$ \\
|
| 13 |
+
0.05 & 55.17 & 71.11 & 84.84 & 61.20 & 0.58 & 3.3$\times$ \\
|
| 14 |
+
0.1 & 42.44 & 59.59 & 86.06 & 45.57 & 0.4 & 4.8$\times$ \\
|
| 15 |
+
\hline
|
| 16 |
+
\end{tabular}
|
| 17 |
+
}
|
| 18 |
+
\vspace{-1mm}
|
| 19 |
+
\caption{Ablation on the threshold for refinement. At $\alpha=0.0$, all windows are passed to the fine module.}
|
| 20 |
+
\label{table:thresholds}
|
| 21 |
+
%\vspace{-2mm}
|
| 22 |
+
\end{table}
|