
DINOv2 depth estimation

by pulluper 2024. 8. 30.

 

git clone https://github.com/facebookresearch/dinov2
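The depth estimation code lives under dinov2/eval/depth and needs mmcv and mmsegmentation on top of PyTorch. A minimal setup sketch, assuming the pinned versions in the repo's requirements-extras.txt are still current:

cd dinov2
pip install -r requirements.txt -r requirements-extras.txt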

 


 

Create the following script (e.g., with nano). It loads the DINOv2 ViT-L/14 backbone from torch.hub, attaches the NYU-trained DPT depth head, runs inference on a single image, and saves a colorized depth map:

import math
import itertools
from functools import partial

import torch
import torch.nn.functional as F
from dinov2.eval.depth.models import build_depther



class CenterPadding(torch.nn.Module):
    """Zero-pad H and W up to the next multiple of `multiple` (the ViT patch size),
    splitting the padding as evenly as possible between the two sides."""

    def __init__(self, multiple):
        super().__init__()
        self.multiple = multiple

    def _get_pad(self, size):
        new_size = math.ceil(size / self.multiple) * self.multiple
        pad_size = new_size - size
        pad_size_left = pad_size // 2
        pad_size_right = pad_size - pad_size_left
        return pad_size_left, pad_size_right

    @torch.inference_mode()
    def forward(self, x):
        pads = list(itertools.chain.from_iterable(self._get_pad(m) for m in x.shape[:1:-1]))
        output = F.pad(x, pads)
        return output
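# A quick check of the padding arithmetic (with multiple=14): a (1, 3, 230, 230)
# input becomes (1, 3, 238, 238), since ceil(230 / 14) * 14 = 238 and the extra
# 8 pixels are split 4 / 4 between the two sides of each spatial dimension.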

def create_depther(cfg, backbone_model, backbone_size, head_type):
    train_cfg = cfg.get("train_cfg")
    test_cfg = cfg.get("test_cfg")
    depther = build_depther(cfg.model, train_cfg=train_cfg, test_cfg=test_cfg)

    print(depther)

    # Route the depther's backbone through DINOv2's get_intermediate_layers so the
    # head receives reshaped (B, C, H/patch, W/patch) feature maps from the blocks
    # listed in out_indices, plus the class token when the config asks for it.
    depther.backbone.forward = partial(
        backbone_model.get_intermediate_layers,
        n=cfg.model.backbone.out_indices,
        reshape=True,
        return_class_token=cfg.model.backbone.output_cls_token,
        norm=cfg.model.backbone.final_norm,
    )

    if hasattr(backbone_model, "patch_size"):
        # Pad every input to a multiple of the patch size before the backbone runs.
        depther.backbone.register_forward_pre_hook(lambda _, x: CenterPadding(backbone_model.patch_size)(x[0]))

    return depther


BACKBONE_SIZE = "large"  # in ("small", "base", "large", "giant")


# load the DINOv2 backbone
backbone_model = torch.hub.load('facebookresearch/dinov2', 'dinov2_vitl14')
backbone_model.eval()
backbone_model.cuda()

import mmcv
import urllib.request
from mmcv.runner import load_checkpoint

def load_config_from_url(url: str) -> str:
    with urllib.request.urlopen(url) as f:
        return f.read().decode()

HEAD_DATASET = "nyu" # in ("nyu", "kitti")
HEAD_TYPE = "dpt" # in ("linear", "linear4", "dpt")

backbone_name = "dinov2_vitl14"
DINOV2_BASE_URL = "https://dl.fbaipublicfiles.com/dinov2"

head_config_url = f"{DINOV2_BASE_URL}/{backbone_name}/{backbone_name}_{HEAD_DATASET}_{HEAD_TYPE}_config.py"
head_checkpoint_url = f"{DINOV2_BASE_URL}/{backbone_name}/{backbone_name}_{HEAD_DATASET}_{HEAD_TYPE}_head.pth"

cfg_str = load_config_from_url(head_config_url)
cfg = mmcv.Config.fromstring(cfg_str, file_format=".py")

model = create_depther(
    cfg,
    backbone_model=backbone_model,
    backbone_size=BACKBONE_SIZE,
    head_type=HEAD_TYPE,
)

load_checkpoint(model, head_checkpoint_url, map_location="cpu")

model.eval()
model.cuda()


from PIL import Image

image = Image.open("000003.png").convert("RGB")
import matplotlib
from torchvision import transforms


def make_depth_transform() -> transforms.Compose:
    return transforms.Compose([
        transforms.ToTensor(),
        lambda x: 255.0 * x[:3],  # Discard alpha component and scale by 255
        transforms.Normalize(  # ImageNet mean/std, expressed in 0-255 pixel units
            mean=(123.675, 116.28, 103.53),
            std=(58.395, 57.12, 57.375),
        ),
    ])


def render_depth(values, colormap_name="magma_r") -> Image.Image:
    min_value, max_value = values.min(), values.max()
    normalized_values = (values - min_value) / (max_value - min_value)

    colormap = matplotlib.colormaps[colormap_name]
    colors = colormap(normalized_values, bytes=True) # ((1)xhxwx4)
    colors = colors[:, :, :3] # Discard alpha component
    return Image.fromarray(colors)


transform = make_depth_transform()

scale_factor = 1  # increase to run inference at a higher input resolution
rescaled_image = image.resize((scale_factor * image.width, scale_factor * image.height))
transformed_image = transform(rescaled_image)
batch = transformed_image.unsqueeze(0).cuda() # Make a batch of one image

with torch.inference_mode():
    result = model.whole_inference(batch, img_meta=None, rescale=True)

depth_image = render_depth(result.squeeze().cpu())

# pil image save
depth_image.save("depth_image.jpg")
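As a quick sanity check on the backbone wiring, the snippet below (a minimal sketch, assuming a CUDA device and the dinov2_vitl14 backbone loaded above) prints the shape of the reshaped feature map that the depth head consumes:

with torch.inference_mode():
    feats = backbone_model.get_intermediate_layers(
        torch.randn(1, 3, 224, 224, device="cuda"), n=1, reshape=True
    )
# ViT-L/14 on a 224x224 input yields a (1, 1024, 16, 16) feature map.
print(feats[0].shape)

The second script below folds the same pipeline into one file, caches the head config and checkpoint locally with wget, and times batched inference: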

 

 

import math
import itertools
from functools import partial

import torch
import torch.nn.functional as F
from dinov2.eval.depth.models import build_depther
import os
import wget
import time

import mmcv
from mmcv.runner import load_checkpoint

from PIL import Image
import matplotlib
from torchvision import transforms

# input padding
class CenterPadding(torch.nn.Module):
    def __init__(self, multiple):
        super().__init__()
        self.multiple = multiple

    def _get_pad(self, size):
        new_size = math.ceil(size / self.multiple) * self.multiple
        pad_size = new_size - size
        pad_size_left = pad_size // 2
        pad_size_right = pad_size - pad_size_left
        return pad_size_left, pad_size_right

    @torch.inference_mode()
    def forward(self, x):
        pads = list(itertools.chain.from_iterable(self._get_pad(m) for m in x.shape[:1:-1]))
        output = F.pad(x, pads)
        return output

# model
def create_depther(cfg, backbone_model, backbone_size, head_type):
    train_cfg = cfg.get("train_cfg")
    test_cfg = cfg.get("test_cfg")
    depther = build_depther(cfg.model, train_cfg=train_cfg, test_cfg=test_cfg)

    print(depther)

    depther.backbone.forward = partial(
        backbone_model.get_intermediate_layers,
        n=cfg.model.backbone.out_indices,
        reshape=True,
        return_class_token=cfg.model.backbone.output_cls_token,
        norm=cfg.model.backbone.final_norm,
    )

    if hasattr(backbone_model, "patch_size"):
        depther.backbone.register_forward_pre_hook(lambda _, x: CenterPadding(backbone_model.patch_size)(x[0]))

    return depther

def make_depth_transform() -> transforms.Compose:
    return transforms.Compose([
        transforms.ToTensor(),
        lambda x: 255.0 * x[:3], # Discard alpha component and scale by 255
        transforms.Normalize(
            mean=(123.675, 116.28, 103.53),
            std=(58.395, 57.12, 57.375),
        ),
    ])

def render_depth(values, colormap_name="magma_r") -> Image.Image:
    min_value, max_value = values.min(), values.max()
    normalized_values = (values - min_value) / (max_value - min_value)

    colormap = matplotlib.colormaps[colormap_name]
    colors = colormap(normalized_values, bytes=True) # ((1)xhxwx4)
    colors = colors[:, :, :3] # Discard alpha component
    return Image.fromarray(colors)

if __name__ == "__main__":

    tic = time.time()

    # download the head config and checkpoint if they are not cached locally
    DINOV2_BASE_URL = "https://dl.fbaipublicfiles.com/dinov2"
    backbone_name = "dinov2_vitl14"
    head_config_url = f"{DINOV2_BASE_URL}/{backbone_name}/{backbone_name}_nyu_dpt_config.py"
    head_checkpoint_url = f"{DINOV2_BASE_URL}/{backbone_name}/{backbone_name}_nyu_dpt_head.pth"

    cfg_path = 'dinov2_vitl14_nyu_dpt_config.py'
    checkpoint_path = 'dinov2_vitl14_nyu_dpt_head.pth'

    if not os.path.exists(cfg_path):
        wget.download(head_config_url)
    if not os.path.exists(checkpoint_path):
        wget.download(head_checkpoint_url)

    # load the DINOv2 backbone
    backbone_model = torch.hub.load('facebookresearch/dinov2', 'dinov2_vitl14')
    backbone_model.eval()
    backbone_model.cuda()

    with open(cfg_path, "r") as file:
        cfg_str = file.read()
    cfg = mmcv.Config.fromstring(cfg_str, file_format=".py")

    model = create_depther(
        cfg,
        backbone_model=backbone_model,
        backbone_size="large",
        head_type="dpt",
    )

    # load checkpoint
    load_checkpoint(model, checkpoint_path, map_location="cpu")
    model.eval()
    model.cuda()

    toc = time.time()
    print(f"Model loading time: {toc - tic:.2f}s")

    image = Image.open("224.png").convert("RGB")
    transform = make_depth_transform()
    rescaled_image = image.resize((224, 224))
    transformed_image = transform(rescaled_image)

    # batch = transformed_image.unsqueeze(0).cuda()  # a batch of one image
    # Stack the same image 16 times to exercise batched-inference throughput:
    batchsize = 16
    batch = torch.cat([transformed_image.unsqueeze(0) for _ in range(batchsize)], dim=0).cuda()

    tic = time.time()
    with torch.inference_mode():
        result = model.whole_inference(batch, img_meta=None, rescale=True)

    toc = time.time()
    print(f"Model inference time: {toc - tic:.2f}s")
    # depth_image = render_depth(result.squeeze().cpu())

    # # pil image save
    # depth_image.save("depth_image_1.jpg")
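To also save the batched outputs, a minimal sketch to drop into the __main__ block (assuming whole_inference returns a (batchsize, 1, H, W) tensor, consistent with the single-image case above):

    # Hypothetical extension: colorize and save every depth map in the batch.
    for i in range(result.shape[0]):
        depth_image = render_depth(result[i].squeeze().cpu())
        depth_image.save(f"depth_image_{i}.jpg")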




