|
|
<!DOCTYPE html> |
|
|
|
|
|
<html lang="en">
|
|
<head> |
|
|
<meta charset="utf-8">
|
|
|
|
|
<meta name="description" content="SegGen"> |
|
|
<meta name="keywords" content="SegGen"> |
|
|
<meta name="viewport" content="width=device-width, initial-scale=1"> |
|
|
|
|
|
<style>svg:not(:root).svg-inline--fa {
|
|
overflow: visible |
|
|
} |
|
|
|
|
|
.svg-inline--fa { |
|
|
display: inline-block; |
|
|
font-size: inherit; |
|
|
height: 1em; |
|
|
overflow: visible; |
|
|
vertical-align: -.125em |
|
|
} |
|
|
|
|
|
.svg-inline--fa.fa-lg { |
|
|
vertical-align: -.225em |
|
|
} |
|
|
|
|
|
.svg-inline--fa.fa-w-1 { |
|
|
width: .0625em |
|
|
} |
|
|
|
|
|
.svg-inline--fa.fa-w-2 { |
|
|
width: .125em |
|
|
} |
|
|
|
|
|
.svg-inline--fa.fa-w-3 { |
|
|
width: .1875em |
|
|
} |
|
|
|
|
|
.svg-inline--fa.fa-w-4 { |
|
|
width: .25em |
|
|
} |
|
|
|
|
|
.svg-inline--fa.fa-w-5 { |
|
|
width: .3125em |
|
|
} |
|
|
|
|
|
.svg-inline--fa.fa-w-6 { |
|
|
width: .375em |
|
|
} |
|
|
|
|
|
.svg-inline--fa.fa-w-7 { |
|
|
width: .4375em |
|
|
} |
|
|
|
|
|
.svg-inline--fa.fa-w-8 { |
|
|
width: .5em |
|
|
} |
|
|
|
|
|
.svg-inline--fa.fa-w-9 { |
|
|
width: .5625em |
|
|
} |
|
|
|
|
|
.svg-inline--fa.fa-w-10 { |
|
|
width: .625em |
|
|
} |
|
|
|
|
|
.svg-inline--fa.fa-w-11 { |
|
|
width: .6875em |
|
|
} |
|
|
|
|
|
.svg-inline--fa.fa-w-12 { |
|
|
width: .75em |
|
|
} |
|
|
|
|
|
.svg-inline--fa.fa-w-13 { |
|
|
width: .8125em |
|
|
} |
|
|
|
|
|
.svg-inline--fa.fa-w-14 { |
|
|
width: .875em |
|
|
} |
|
|
|
|
|
.svg-inline--fa.fa-w-15 { |
|
|
width: .9375em |
|
|
} |
|
|
|
|
|
.svg-inline--fa.fa-w-16 { |
|
|
width: 1em |
|
|
} |
|
|
|
|
|
.svg-inline--fa.fa-w-17 { |
|
|
width: 1.0625em |
|
|
} |
|
|
|
|
|
.svg-inline--fa.fa-w-18 { |
|
|
width: 1.125em |
|
|
} |
|
|
|
|
|
.svg-inline--fa.fa-w-19 { |
|
|
width: 1.1875em |
|
|
} |
|
|
|
|
|
.svg-inline--fa.fa-w-20 { |
|
|
width: 1.25em |
|
|
} |
|
|
|
|
|
.svg-inline--fa.fa-pull-left { |
|
|
margin-right: .3em; |
|
|
width: auto |
|
|
} |
|
|
|
|
|
.svg-inline--fa.fa-pull-right { |
|
|
margin-left: .3em; |
|
|
width: auto |
|
|
} |
|
|
|
|
|
.svg-inline--fa.fa-border { |
|
|
height: 1.5em |
|
|
} |
|
|
|
|
|
.svg-inline--fa.fa-li { |
|
|
width: 2em |
|
|
} |
|
|
|
|
|
.svg-inline--fa.fa-fw { |
|
|
width: 1.25em |
|
|
} |
|
|
|
|
|
.fa-layers svg.svg-inline--fa { |
|
|
bottom: 0; |
|
|
left: 0; |
|
|
margin: auto; |
|
|
position: absolute; |
|
|
right: 0; |
|
|
top: 0 |
|
|
} |
|
|
|
|
|
.fa-layers { |
|
|
display: inline-block; |
|
|
height: 1em; |
|
|
position: relative; |
|
|
text-align: center; |
|
|
vertical-align: -.125em; |
|
|
width: 1em |
|
|
} |
|
|
|
|
|
.fa-layers svg.svg-inline--fa { |
|
|
-webkit-transform-origin: center center; |
|
|
transform-origin: center center |
|
|
} |
|
|
|
|
|
.fa-layers-counter, .fa-layers-text { |
|
|
display: inline-block; |
|
|
position: absolute; |
|
|
text-align: center |
|
|
} |
|
|
|
|
|
.fa-layers-text { |
|
|
left: 50%; |
|
|
top: 50%; |
|
|
-webkit-transform: translate(-50%, -50%); |
|
|
transform: translate(-50%, -50%); |
|
|
-webkit-transform-origin: center center; |
|
|
transform-origin: center center |
|
|
} |
|
|
|
|
|
.fa-layers-counter { |
|
|
background-color: #ff253a; |
|
|
border-radius: 1em; |
|
|
-webkit-box-sizing: border-box; |
|
|
box-sizing: border-box; |
|
|
color: #fff; |
|
|
height: 1.5em; |
|
|
line-height: 1; |
|
|
max-width: 5em; |
|
|
min-width: 1.5em; |
|
|
overflow: hidden; |
|
|
padding: .25em; |
|
|
right: 0; |
|
|
text-overflow: ellipsis; |
|
|
top: 0; |
|
|
-webkit-transform: scale(.25); |
|
|
transform: scale(.25); |
|
|
-webkit-transform-origin: top right; |
|
|
transform-origin: top right |
|
|
} |
|
|
|
|
|
.fa-layers-bottom-right { |
|
|
bottom: 0; |
|
|
right: 0; |
|
|
top: auto; |
|
|
-webkit-transform: scale(.25); |
|
|
transform: scale(.25); |
|
|
-webkit-transform-origin: bottom right; |
|
|
transform-origin: bottom right |
|
|
} |
|
|
|
|
|
.fa-layers-bottom-left { |
|
|
bottom: 0; |
|
|
left: 0; |
|
|
right: auto; |
|
|
top: auto; |
|
|
-webkit-transform: scale(.25); |
|
|
transform: scale(.25); |
|
|
-webkit-transform-origin: bottom left; |
|
|
transform-origin: bottom left |
|
|
} |
|
|
|
|
|
.fa-layers-top-right { |
|
|
right: 0; |
|
|
top: 0; |
|
|
-webkit-transform: scale(.25); |
|
|
transform: scale(.25); |
|
|
-webkit-transform-origin: top right; |
|
|
transform-origin: top right |
|
|
} |
|
|
|
|
|
.fa-layers-top-left { |
|
|
left: 0; |
|
|
right: auto; |
|
|
top: 0; |
|
|
-webkit-transform: scale(.25); |
|
|
transform: scale(.25); |
|
|
-webkit-transform-origin: top left; |
|
|
transform-origin: top left |
|
|
} |
|
|
|
|
|
.fa-lg { |
|
|
font-size: 1.3333333333em; |
|
|
line-height: .75em; |
|
|
vertical-align: -.0667em |
|
|
} |
|
|
|
|
|
.fa-xs { |
|
|
font-size: .75em |
|
|
} |
|
|
|
|
|
.fa-sm { |
|
|
font-size: .875em |
|
|
} |
|
|
|
|
|
.fa-1x { |
|
|
font-size: 1em |
|
|
} |
|
|
|
|
|
.fa-2x { |
|
|
font-size: 2em |
|
|
} |
|
|
|
|
|
.fa-3x { |
|
|
font-size: 3em |
|
|
} |
|
|
|
|
|
.fa-4x { |
|
|
font-size: 4em |
|
|
} |
|
|
|
|
|
.fa-5x { |
|
|
font-size: 5em |
|
|
} |
|
|
|
|
|
.fa-6x { |
|
|
font-size: 6em |
|
|
} |
|
|
|
|
|
.fa-7x { |
|
|
font-size: 7em |
|
|
} |
|
|
|
|
|
.fa-8x { |
|
|
font-size: 8em |
|
|
} |
|
|
|
|
|
.fa-9x { |
|
|
font-size: 9em |
|
|
} |
|
|
|
|
|
.fa-10x { |
|
|
font-size: 10em |
|
|
} |
|
|
|
|
|
.fa-fw { |
|
|
text-align: center; |
|
|
width: 1.25em |
|
|
} |
|
|
|
|
|
.fa-ul { |
|
|
list-style-type: none; |
|
|
margin-left: 2.5em; |
|
|
padding-left: 0 |
|
|
} |
|
|
|
|
|
.fa-ul > li { |
|
|
position: relative |
|
|
} |
|
|
|
|
|
.fa-li { |
|
|
left: -2em; |
|
|
position: absolute; |
|
|
text-align: center; |
|
|
width: 2em; |
|
|
line-height: inherit |
|
|
} |
|
|
|
|
|
.fa-border { |
|
|
border: solid .08em #eee; |
|
|
border-radius: .1em; |
|
|
padding: .2em .25em .15em |
|
|
} |
|
|
|
|
|
.fa-pull-left { |
|
|
float: left |
|
|
} |
|
|
|
|
|
.fa-pull-right { |
|
|
float: right |
|
|
} |
|
|
|
|
|
.fa.fa-pull-left, .fab.fa-pull-left, .fal.fa-pull-left, .far.fa-pull-left, .fas.fa-pull-left { |
|
|
margin-right: .3em |
|
|
} |
|
|
|
|
|
.fa.fa-pull-right, .fab.fa-pull-right, .fal.fa-pull-right, .far.fa-pull-right, .fas.fa-pull-right { |
|
|
margin-left: .3em |
|
|
} |
|
|
|
|
|
.fa-spin { |
|
|
-webkit-animation: fa-spin 2s infinite linear; |
|
|
animation: fa-spin 2s infinite linear |
|
|
} |
|
|
|
|
|
.fa-pulse { |
|
|
-webkit-animation: fa-spin 1s infinite steps(8); |
|
|
animation: fa-spin 1s infinite steps(8) |
|
|
} |
|
|
|
|
|
@-webkit-keyframes fa-spin { |
|
|
0% { |
|
|
-webkit-transform: rotate(0); |
|
|
transform: rotate(0) |
|
|
} |
|
|
100% { |
|
|
-webkit-transform: rotate(360deg); |
|
|
transform: rotate(360deg) |
|
|
} |
|
|
} |
|
|
|
|
|
@keyframes fa-spin { |
|
|
0% { |
|
|
-webkit-transform: rotate(0); |
|
|
transform: rotate(0) |
|
|
} |
|
|
100% { |
|
|
-webkit-transform: rotate(360deg); |
|
|
transform: rotate(360deg) |
|
|
} |
|
|
} |
|
|
|
|
|
.fa-rotate-90 { |
|
|
-webkit-transform: rotate(90deg); |
|
|
transform: rotate(90deg) |
|
|
} |
|
|
|
|
|
.fa-rotate-180 { |
|
|
-webkit-transform: rotate(180deg); |
|
|
transform: rotate(180deg) |
|
|
} |
|
|
|
|
|
.fa-rotate-270 { |
|
|
-webkit-transform: rotate(270deg); |
|
|
transform: rotate(270deg) |
|
|
} |
|
|
|
|
|
.fa-flip-horizontal { |
|
|
-webkit-transform: scale(-1, 1); |
|
|
transform: scale(-1, 1) |
|
|
} |
|
|
|
|
|
.fa-flip-vertical { |
|
|
-webkit-transform: scale(1, -1); |
|
|
transform: scale(1, -1) |
|
|
} |
|
|
|
|
|
.fa-flip-both, .fa-flip-horizontal.fa-flip-vertical { |
|
|
-webkit-transform: scale(-1, -1); |
|
|
transform: scale(-1, -1) |
|
|
} |
|
|
|
|
|
:root .fa-flip-both, :root .fa-flip-horizontal, :root .fa-flip-vertical, :root .fa-rotate-180, :root .fa-rotate-270, :root .fa-rotate-90 { |
|
|
-webkit-filter: none; |
|
|
filter: none |
|
|
} |
|
|
|
|
|
.fa-stack { |
|
|
display: inline-block; |
|
|
height: 2em; |
|
|
position: relative; |
|
|
width: 2.5em |
|
|
} |
|
|
|
|
|
.fa-stack-1x, .fa-stack-2x { |
|
|
bottom: 0; |
|
|
left: 0; |
|
|
margin: auto; |
|
|
position: absolute; |
|
|
right: 0; |
|
|
top: 0 |
|
|
} |
|
|
|
|
|
.svg-inline--fa.fa-stack-1x { |
|
|
height: 1em; |
|
|
width: 1.25em |
|
|
} |
|
|
|
|
|
.svg-inline--fa.fa-stack-2x { |
|
|
height: 2em; |
|
|
width: 2.5em |
|
|
} |
|
|
|
|
|
.fa-inverse { |
|
|
color: #fff |
|
|
} |
|
|
|
|
|
.sr-only { |
|
|
border: 0; |
|
|
clip: rect(0, 0, 0, 0); |
|
|
height: 1px; |
|
|
margin: -1px; |
|
|
overflow: hidden; |
|
|
padding: 0; |
|
|
position: absolute; |
|
|
width: 1px |
|
|
} |
|
|
|
|
|
.sr-only-focusable:active, .sr-only-focusable:focus { |
|
|
clip: auto; |
|
|
height: auto; |
|
|
margin: 0; |
|
|
overflow: visible; |
|
|
position: static; |
|
|
width: auto |
|
|
} |
|
|
|
|
|
.svg-inline--fa .fa-primary { |
|
|
fill: var(--fa-primary-color, currentColor); |
|
|
opacity: 1; |
|
|
opacity: var(--fa-primary-opacity, 1) |
|
|
} |
|
|
|
|
|
.svg-inline--fa .fa-secondary { |
|
|
fill: var(--fa-secondary-color, currentColor); |
|
|
opacity: .4; |
|
|
opacity: var(--fa-secondary-opacity, .4) |
|
|
} |
|
|
|
|
|
.svg-inline--fa.fa-swap-opacity .fa-primary { |
|
|
opacity: .4; |
|
|
opacity: var(--fa-secondary-opacity, .4) |
|
|
} |
|
|
|
|
|
.svg-inline--fa.fa-swap-opacity .fa-secondary { |
|
|
opacity: 1; |
|
|
opacity: var(--fa-primary-opacity, 1) |
|
|
} |
|
|
|
|
|
.svg-inline--fa mask .fa-primary, .svg-inline--fa mask .fa-secondary { |
|
|
fill: #000 |
|
|
} |
|
|
|
|
|
.fad.fa-inverse { |
|
|
color: #fff |
|
|
}</style> |
|
|
|
|
|
|
|
|
<title> |
|
|
SUM: Uncertainty-aware Fine-tuning of Segmentation Foundation Models |
|
|
</title> |
|
|
|
|
|
<!-- Google tag (gtag.js). The async loader below is required: without it the
     gtag() calls only queue commands into dataLayer and nothing is ever sent. -->
<script async src="https://www.googletagmanager.com/gtag/js?id=G-EDF010G6PN"></script>
<script>
    // Standard gtag bootstrap: queue commands until the gtag.js library loads.
    window.dataLayer = window.dataLayer || [];

    function gtag() {
        dataLayer.push(arguments);
    }

    gtag('js', new Date());

    // Measurement ID must match the one in the loader URL above.
    gtag('config', 'G-EDF010G6PN');
</script>
|
|
|
|
|
|
|
|
<!-- NOTE(review): two jQuery builds are loaded; jquery-1.11.0.min.js below
     overwrites files/jquery.min.js. Confirm which version is needed and drop the other. -->
<script src="files/jquery.min.js"></script>

<script src="files/jquery-1.11.0.min.js"></script>

<script src="files/jquery-migrate-1.2.1.min.js"></script>
|
|
<script src="files/interact.min.js"></script> |
|
|
|
|
|
<link href="files/css" rel="stylesheet"> |
|
|
|
|
|
<link rel="stylesheet" type="text/css" href="files/slick.css"> |
|
|
<link rel="stylesheet" type="text/css" href="files/slick-theme.css"> |
|
|
|
|
|
<link rel="stylesheet" href="files/bulma.min.css"> |
|
|
<link rel="stylesheet" href="files/bulma-slider.min.css"> |
|
|
<link rel="stylesheet" href="files/fontawesome.all.min.css"> |
|
|
<link rel="stylesheet" href="files/academicons.min.css"> |
|
|
<link rel="stylesheet" href="files/index.css"> |
|
|
|
|
|
<script defer src="files/fontawesome.all.min.js"></script>
|
|
<script src="files/bulma-slider.min.js"></script> |
|
|
<script src="files/index.js"></script> |
|
|
|
|
|
</head> |
|
|
<body>
|
|
|
|
|
<nav class="navbar" role="navigation" aria-label="main navigation"> |
|
|
<div class="navbar-brand"> |
|
|
<a role="button" class="navbar-burger" aria-label="menu" aria-expanded="false"> |
|
|
<span aria-hidden="true"></span> |
|
|
<span aria-hidden="true"></span> |
|
|
<span aria-hidden="true"></span> |
|
|
</a> |
|
|
</div> |
|
|
<div class="navbar-menu"> |
|
|
<div class="navbar-start" style="flex-grow: 1; justify-content: center;"> |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
</div> |
|
|
|
|
|
</div> |
|
|
</nav> |
|
|
|
|
|
|
|
|
<section class="hero"> |
|
|
<div class="hero-body"> |
|
|
<div class="container"> |
|
|
<div class="columns is-centered"> |
|
|
|
|
|
|
|
|
|
|
|
</div> |
|
|
<div class="container has-text-centered"> |
|
|
<h1 class="title is-1 publication-title"> |
|
|
<span style="color: #711c3d"> Uncertainty-aware Fine-tuning of Segmentation Foundation Models (NeurIPS 2024)</span> |
|
|
</h1> |
|
|
<div class="is-size-5 publication-authors"> |
|
|
<div class="author-block"> |
|
|
<a href="https://kangning-liu.github.io/">Kangning Liu</a><sup>1,2</sup>, |
|
|
</div> |
|
|
<div class="author-block"> |
|
|
<a href="https://research.adobe.com/person/brian-price/">Brian Price</a><sup>2</sup>, |
|
|
</div> |
|
|
<div class="author-block"> |
|
|
<a href="https://research.adobe.com/person/jason-kuen/">Jason Kuen</a><sup>2</sup>, |
|
|
</div> |
|
|
<div class="author-block"> |
|
|
<a href="https://openreview.net/profile?id=~Yifei_Fan1">Yifei Fan</a><sup>2</sup>, |
|
|
</div> |
|
|
<div class="author-block"> |
|
|
<a href="https://scholar.google.com/citations?user=8l3bFYYAAAAJ&hl=en">Zijun Wei</a><sup>2</sup>, |
|
|
</div> |
|
|
<div class="author-block"> |
|
|
<a href="https://luisf.me/">Luis Figueroa</a><sup>2</sup>, |
|
|
</div> |
|
|
<div class="author-block"> |
|
|
<a href="https://cs.nyu.edu/~kgeras/">Krzysztof J. Geras</a><sup>1</sup>, |
|
|
</div> |
|
|
<div class="author-block"> |
|
|
<a href="https://math.nyu.edu/~cfgranda/">Carlos Fernandez-Granda</a><sup>1</sup>, |
|
|
</div> |
|
|
<div class="is-size-5 publication-authors"> |
|
|
<span class="author-block"><sup>1</sup>New York University</span> |
|
|
<span class="author-block"><sup>2</sup>Adobe</span> |
|
|
</div> |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<div class="column has-text-centered"> |
|
|
<div class="publication-links"> |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<span class="link-block"> |
|
|
|
|
|
<a href="https://openreview.net/pdf?id=qNXRXUC90b" class="external-link button is-normal is-rounded is-dark"> |
|
|
<span class="icon"> |
|
|
<svg class="svg-inline--fa fa-file-pdf fa-w-12" aria-hidden="true" focusable="false" |
|
|
data-prefix="fas" data-icon="file-pdf" role="img" xmlns="http://www.w3.org/2000/svg" |
|
|
viewBox="0 0 384 512" data-fa-i2svg=""><path fill="currentColor" |
|
|
d="M181.9 256.1c-5-16-4.9-46.9-2-46.9 8.4 0 7.6 36.9 2 46.9zm-1.7 47.2c-7.7 20.2-17.3 43.3-28.4 62.7 18.3-7 39-17.2 62.9-21.9-12.7-9.6-24.9-23.4-34.5-40.8zM86.1 428.1c0 .8 13.2-5.4 34.9-40.2-6.7 6.3-29.1 24.5-34.9 40.2zM248 160h136v328c0 13.3-10.7 24-24 24H24c-13.3 0-24-10.7-24-24V24C0 10.7 10.7 0 24 0h200v136c0 13.2 10.8 24 24 24zm-8 171.8c-20-12.2-33.3-29-42.7-53.8 4.5-18.5 11.6-46.6 6.2-64.2-4.7-29.4-42.4-26.5-47.8-6.8-5 18.3-.4 44.1 8.1 77-11.6 27.6-28.7 64.6-40.8 85.8-.1 0-.1.1-.2.1-27.1 13.9-73.6 44.5-54.5 68 5.6 6.9 16 10 21.5 10 17.9 0 35.7-18 61.1-61.8 25.8-8.5 54.1-19.1 79-23.2 21.7 11.8 47.1 19.5 64 19.5 29.2 0 31.2-32 19.7-43.4-13.9-13.6-54.3-9.7-73.6-7.2zM377 105L279 7c-4.5-4.5-10.6-7-17-7h-6v128h128v-6.1c0-6.3-2.5-12.4-7-16.9zm-74.1 255.3c4.1-2.7-2.5-11.9-42.8-9 37.1 15.8 42.8 9 42.8 9z"></path></svg> |
|
|
|
|
|
</span> |
|
|
<span>Paper</span> |
|
|
</a> |
|
|
</span> |
|
|
|
|
|
|
|
|
|
|
|
<span class="link-block"> |
|
|
<a href="https://github.com/Kangningthu/SUM" |
|
|
class="external-link button is-normal is-rounded is-dark"> |
|
|
<span class="icon"> |
|
|
<i class="fab fa-github"></i> |
|
|
</span> |
|
|
<span>Github</span> |
|
|
</a> |
|
|
</span> |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
</div> |
|
|
</div> |
|
|
</div> |
|
|
</div> |
|
|
</div> |
|
|
</div> |
|
|
</section> |
|
|
|
|
|
<section class="hero teaser"> |
|
|
<div class="hero-body"> |
|
|
<div class="container is-max-desktop"> |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<img src="files/overall_result.jpg" alt="Qualitative segmentation comparison of SAM, HQ-SAM, and SUM (left) and mean boundary IoU across interactive rounds (right)" style="width: 100%; height: auto;">
|
|
<h2 class="subtitle has-text"> |
|
|
<span style="color: #9e2e23"><b><i>Segmentation with Uncertainty Model (SUM)</i></b></span> improves SAM

without forgetting to "segment anything."
|
|
<br> |
|
|
<b>Left:</b> Both HQ-SAM and SUM show qualitative improvements over SAM, particularly in salient-object |
|
|
segmentation of complex structures (top row). HQ-SAM, however, struggles with background entities |
|
|
(middle row) and part segmentation (bottom row), often erroneously prioritizing objects in the |
|
|
foreground or entire objects. |
|
|
<br> |
|
|
<b>Right:</b> SUM consistently outperforms SAM and HQ-SAM in quantitative comparisons, achieving the |
|
|
highest mean boundary IoU across diverse evaluation sets and interactive segmentation rounds. |
|
|
</h2> |
|
|
</div> |
|
|
</div> |
|
|
</section> |
|
|
|
|
|
|
|
|
<section class="section"> |
|
|
<div class="container is-max-desktop"> |
|
|
|
|
|
|
|
|
<div class="columns is-centered has-text-centered"> |
|
|
<div class="column is-full"> |
|
|
<h2 class="title is-3">Abstract</h2> |
|
|
<div class="content has-text-justified" style="font-size: 20px;"> |
|
|
<p> |
|
|
The Segment Anything Model (SAM) is a large-scale foundation model that has revolutionized |
|
|
segmentation methodology. Despite its impressive generalization ability, the segmentation |
|
|
accuracy of SAM on images with intricate structures is often unsatisfactory. Recent works have |
|
|
proposed lightweight fine-tuning using high-quality annotated data to improve accuracy on such |
|
|
images. However, here we provide extensive empirical evidence that this strategy leads to |
|
|
forgetting how to "segment anything": these models lose the original generalization abilities of |
|
|
SAM, in the sense that they perform worse for segmentation tasks not represented in the |
|
|
annotated fine-tuning set. |
|
|
</p> |
|
|
<p> |
|
|
To improve performance without forgetting, we introduce a novel framework that combines |
|
|
high-quality annotated data with a large unlabeled dataset. The framework relies on two |
|
|
methodological innovations. First, we quantify the uncertainty in the SAM pseudo labels |
|
|
associated with the unlabeled data and leverage it to perform uncertainty-aware fine-tuning. |
|
|
Second, we encode the type of segmentation task associated with each training example using a |
|
|
task prompt to reduce ambiguity. |
|
|
</p> |
|
|
<p> |
|
|
We evaluated the proposed Segmentation with Uncertainty Model (SUM) on a diverse test set |
|
|
consisting of 14 public benchmarks, where it achieves state-of-the-art results. Notably, our |
|
|
method consistently surpasses SAM by 3-6 points in mean IoU and 4-7 in mean boundary IoU across |
|
|
point-prompt interactive segmentation rounds. |
|
|
</p> |
|
|
</div> |
|
|
</div> |
|
|
</div> |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
</div> |
|
|
</section> |
|
|
|
|
|
|
|
|
<section class="hero teaser"> |
|
|
<div class="hero-body"> |
|
|
<div class="container is-max-desktop"> |
|
|
<div class="columns is-centered has-text-centered"> |
|
|
<h2 class="title is-3 centered">Framework</h2> |
|
|
</div> |
|
|
<div class="container is-max-desktop"> |
|
|
<img src="files/unifiedpseudoannotedtrainingv12.png" alt="Diagram of the SUM training framework for human-annotated and unlabeled data" style="width: 100%; height: auto;">
|
|
<h2 class="subtitle has-text"> |
|
|
<b>Framework of SUM </b>: |
|
|
<b>Top</b>: When processing human-annotated examples, interactive prompts are sampled based on the |
|
|
binary-mask labels and fed iteratively into the model along with the image. Since this binary mask |
|
|
depends on the type of segmentation task desired by the user, SUM incorporates a task prompt that |
|
|
specifies the task relevant to each annotation (1 for salient-object segmentation and 2 for entity |
|
|
segmentation). |
|
|
<br> |
|
|
<b>Bottom</b>: For unlabeled images, the iterative prompts are sampled based on model-generated |
|
|
binary pseudo-labels, which may be inaccurate. SUM includes an uncertainty-quantification module |
|
|
that processes the pseudo-labels, generating an uncertainty map. This map is leveraged within an |
|
|
uncertainty-aware loss function used for training, and also informs how the interactive prompts are |
|
|
sampled. For all unlabeled data, the task prompt is set to 0. |
|
|
</h2> |
|
|
</div> |
|
|
</div> |
|
|
</div> |
|
|
</section> |
|
|
|
|
|
|
|
|
<section class="hero teaser"> |
|
|
<div class="hero-body"> |
|
|
<div class="container is-max-desktop"> |
|
|
<div class="columns is-centered has-text-centered"> |
|
|
<h2 class="title is-3 centered">Generation of Uncertainty Map</h2> |
|
|
</div> |
|
|
<div class="container is-max-desktop"> |
|
|
<img src="files/uncertaintyquantification.png" alt="Pipeline for generating uncertainty maps as the absolute difference between SAM and refined predictions" style="width: 100%; height: auto;">
|
|
<h2 class="subtitle has-text"> |
|
|
<b>Generation of uncertainty maps</b>: (1) The mask-refinement module receives as input the |
|
|
segmentation prediction produced by SAM. (2) The module produces a refined segmentation mask. (3) |
|
|
The uncertainty map equals the absolute difference between the SAM and refined predictions. |
|
|
</h2> |
|
|
</div> |
|
|
</div> |
|
|
</div> |
|
|
</section> |
|
|
|
|
|
|
|
|
<section class="hero teaser"> |
|
|
<div class="hero-body"> |
|
|
<div class="container is-max-desktop"> |
|
|
<div class="columns is-centered has-text-centered"> |
|
|
<h2 class="title is-3 centered">Better Quality</h2> |
|
|
</div> |
|
|
|
|
|
<div class="block"> |
|
|
<h2 class="subtitle has-text"> |
|
|
Comparative visualization of segmentation outcomes using single-box prompts. |
|
|
</h2> |
|
|
<div style="display: grid; place-items: center;"> |
|
|
<img src="files/example3.jpg" alt="Segmentation results from single-box prompts for SAM, HQ-SAM, and SUM" style="width: 85%; height: auto;"></div>
|
|
</div> |
|
|
<hr> |
|
|
<div class="block"> |
|
|
<h2 class="subtitle has-text"> |
|
|
Comparative visualization of segmentation outcomes using point prompts, where blue points signify |
|
|
positive prompts and red points indicate negative prompts. We adhere to the same point prompt |
|
|
sampling evaluation strategy as SAM. |
|
|
</h2> |
|
|
<div style="display: grid; place-items: center;"> |
|
|
<img src="files/example2.jpg" alt="Segmentation results from positive and negative point prompts for SAM, HQ-SAM, and SUM" style="width: 85%; height: auto;"></div>
|
|
</div> |
|
|
</div> |
|
|
</div> |
|
|
</section> |
|
|
|
|
|
|
|
|
<section class="hero teaser"> |
|
|
<div class="hero-body"> |
|
|
<div class="container is-max-desktop"> |
|
|
<div class="columns is-centered has-text-centered"> |
|
|
<h2 class="title is-3 centered">Dataset</h2> |
|
|
</div> |
|
|
<h2 class="subtitle has-text"> |
|
|
Fine-tuning under different human annotation budget: FT-Small, FT-Medium, FT-Large |
|
|
</h2> |
|
|
<img src="files/fig_dataset.jpg" alt="Composition of the FT-Small, FT-Medium, and FT-Large fine-tuning datasets" style="width: 100%; height: auto;">
|
|
</div> |
|
|
</div> |
|
|
</section> |
|
|
|
|
|
|
|
|
<section class="hero teaser"> |
|
|
<div class="hero-body"> |
|
|
<div class="container is-max-desktop"> |
|
|
<div class="columns is-centered has-text-centered"> |
|
|
<h2 class="title is-3 centered">Experiments</h2> |
|
|
</div> |
|
|
|
|
|
<div class="block"> |
|
|
<h2 class="subtitle has-text"> |
|
|
<b> Comparison of HQ-SAM with Vanilla and SUM fine-tuned Using the Same Lightweight |
|
|
Scheme as HQ-SAM</b> SUM Matches HQ-SAM and outperforms Vanilla in salient-object segmentation |
|
|
and is superior in entity and part segmentation. |
|
|
</h2> |
|
|
<div style="display: grid; place-items: center;"> |
|
|
<img src="files/merged_iou_clean_HQSeg-44k-f1.png" alt="mIoU comparison of HQ-SAM, Vanilla fine-tuning, and SUM across segmentation tasks" style="width: 85%; height: auto;">
|
|
</div> |
|
|
</div> |
|
|
<hr> |
|
|
|
|
|
<div class="block"> |
|
|
<h2 class="subtitle has-text"> |
|
|
<b> Comparison with Other Light-weight Fine-tuning Methods</b> single point-prompt segmentation mIoU |
|
|
for SUM versus models |
|
|
fine-tuned using various strategies on the HQSeg-44K dataset. All competing models improve on the |
|
|
salient-object segmentation task associated with this dataset but deteriorate on other segmentation |
|
|
tasks.</h2> |
|
|
<div style="display: grid; place-items: center;"> |
|
|
<img src="files/table_lw_ft.jpg" alt="Single point-prompt mIoU table comparing SUM with other lightweight fine-tuning methods" style="width: 50%; height: auto;">
|
|
</div> |
|
|
</div> |
|
|
|
|
|
<hr> |
|
|
<div class="block"> |
|
|
<h2 class="subtitle has-text"> |
|
|
<b>Comparison with Semi-supervised Methods</b> 3 point-prompt segmentation evaluation of models |
|
|
fine-tuned on FT-Small dataset with various strategies. SUM clearly outperforms all other |
|
|
strategies. |
|
|
</h2> |
|
|
<img src="files/Semi_3point_seg_w_sum.png" alt="3 point-prompt segmentation comparison of SUM with semi-supervised fine-tuning strategies">
|
|
|
|
|
</div> |
|
|
|
|
|
<hr> |
|
|
|
|
|
<div class="block"> |
|
|
<h2 class="subtitle has-text"> |
|
|
<b> Comparison of SAM with SUM Fine-tuned Under Different Human Annotation Budget</b> 5 |
|
|
point-prompt |
|
|
segmentation evaluation. SUM consistently outperforms SAM, showing even greater improvement as the |
|
|
budget of human-annotated data increases. |
|
|
</h2> |
|
|
<img src="files/table_sum_diff_budgets.jpg" alt="5 point-prompt evaluation of SAM versus SUM under different human annotation budgets" style="width: 100%; height: auto;">
|
|
</div> |
|
|
|
|
|
<hr> |
|
|
|
|
|
<div class="block"> |
|
|
<h2 class="subtitle has-text"> |
|
|
<b> Additional Evaluation</b> To test the generalization ability of SUM to a broader range of |
|
|
segmentation tasks, we provided 8 additional datasets. |
|
|
The mIoU comparison results, reported in the following tables, confirm that SUM consistently |
|
|
outperforms |
|
|
SAM. For reproducibility, SUM is fine-tuned on the Public dataset FT-Medium. |
|
|
</h2> |
|
|
<div style="display: grid; place-items: center;"> |
|
|
<img src="files/table_additional_evaluation.jpg" alt="mIoU comparison of SAM and SUM on eight additional evaluation datasets" style="width: 65%; height: auto;">
|
|
</div> |
|
|
</div> |
|
|
|
|
|
<hr> |
|
|
|
|
|
<div class="block"> |
|
|
<h2 class="subtitle has-text"> |
|
|
<b>Ablation Study</b>. This table reports interactive segmentation mean IoU of different ablated |
|
|
versions of SUM fine-tuned on FT-Medium, showing individual gains provided by uncertainty-aware |
|
|
fine-tuning and task prompts. |
|
|
</h2> |
|
|
<img src="files/table_ablation.jpg" alt="Ablation study showing gains from uncertainty-aware fine-tuning and task prompts">
|
|
</div> |
|
|
</div> |
|
|
</div> |
|
|
|
|
|
</section> |
|
|
|
|
|
|
|
|
<section class="section" id="BibTeX"> |
|
|
<div class="container content is-max-desktop"> |
|
|
<h2 class="title">BibTeX</h2> |
|
|
<pre><code>@inproceedings{ |
|
|
liu2024uncertaintyaware, |
|
|
title={Uncertainty-aware Fine-tuning of Segmentation Foundation Models}, |
|
|
author={Kangning Liu and Brian L. Price and Jason Kuen and Yifei Fan and Zijun Wei and Luis Figueroa and Krzysztof J. Geras and Carlos Fernandez-Granda}, |
|
|
booktitle={The Thirty-eighth Annual Conference on Neural Information Processing Systems}, |
|
|
year={2024}, |
|
|
url={https://openreview.net/forum?id=qNXRXUC90b} |
|
|
} |
|
|
</code></pre> |
|
|
</div> |
|
|
</section> |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<section class="hero teaser"> |
|
|
<div class="hero-body"> |
|
|
<div class="container is-max-desktop"> |
|
|
<div class="columns is-centered has-text-centered"> |
|
|
<h2 class="title is-3 centered">Acknowledgements</h2> |
|
|
</div> |
|
|
<h2 class="subtitle has-text"> |
|
|
The authors acknowledge Markus Woodson for valuable discussions and feedback. |
|
|
</h2> |
|
|
</div> |
|
|
</div> |
|
|
</section> |
|
|
|
|
|
|
|
|
|
|
|
<section class="section" id="acknowledgements"> |
|
|
<div class="container content is-max-desktop"> |
|
|
The website template was adapted from |
|
|
<a href="https://seggenerator.github.io/">SegGen</a>. |
|
|
</div> |
|
|
</section> |
|
|
|
|
|
|
|
|
<script type="text/javascript" src="files/slick.min.js"></script> |
|
|
|
|
|
|
|
|
</body> |
|
|
</html> |