
DINOv2 depth estimation

by pulluper 2024. 8. 30.

 

git clone https://github.com/facebookresearch/dinov2
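The depth estimation code lives under dinov2/eval/depth and needs mmcv and mmsegmentation on top of PyTorch. A minimal setup sketch, assuming the pinned versions in the repo's requirements-extras.txt are still current:

cd dinov2
pip install -r requirements.txt -r requirements-extras.txt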

 


 

Create the following script (e.g., with nano). It loads the DINOv2 ViT-L/14 backbone from torch.hub, attaches the NYU-trained DPT depth head, runs inference on a single image, and saves a colorized depth map:

import math
import itertools
from functools import partial

import torch
import torch.nn.functional as F
from dinov2.eval.depth.models import build_depther



class CenterPadding(torch.nn.Module):
    """Zero-pad H and W up to the next multiple of `multiple` (the ViT patch size),
    splitting the padding as evenly as possible between the two sides."""

    def __init__(self, multiple):
        super().__init__()
        self.multiple = multiple

    def _get_pad(self, size):
        new_size = math.ceil(size / self.multiple) * self.multiple
        pad_size = new_size - size
        pad_size_left = pad_size // 2
        pad_size_right = pad_size - pad_size_left
        return pad_size_left, pad_size_right

    @torch.inference_mode()
    def forward(self, x):
        pads = list(itertools.chain.from_iterable(self._get_pad(m) for m in x.shape[:1:-1]))
        output = F.pad(x, pads)
        return output
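# A quick check of the padding arithmetic (with multiple=14): a (1, 3, 230, 230)
# input becomes (1, 3, 238, 238), since ceil(230 / 14) * 14 = 238 and the extra
# 8 pixels are split 4 / 4 between the two sides of each spatial dimension.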

def create_depther(cfg, backbone_model, backbone_size, head_type):
    train_cfg = cfg.get("train_cfg")
    test_cfg = cfg.get("test_cfg")
    depther = build_depther(cfg.model, train_cfg=train_cfg, test_cfg=test_cfg)

    print(depther)

    # Route the depther's backbone through DINOv2's get_intermediate_layers so the
    # head receives reshaped (B, C, H/patch, W/patch) feature maps from the blocks
    # listed in out_indices, plus the class token when the config asks for it.
    depther.backbone.forward = partial(
        backbone_model.get_intermediate_layers,
        n=cfg.model.backbone.out_indices,
        reshape=True,
        return_class_token=cfg.model.backbone.output_cls_token,
        norm=cfg.model.backbone.final_norm,
    )

    if hasattr(backbone_model, "patch_size"):
        # Pad every input to a multiple of the patch size before the backbone runs.
        depther.backbone.register_forward_pre_hook(lambda _, x: CenterPadding(backbone_model.patch_size)(x[0]))

    return depther


BACKBONE_SIZE = "large"  # in ("small", "base", "large", "giant")


# load the DINOv2 backbone
backbone_model = torch.hub.load('facebookresearch/dinov2', 'dinov2_vitl14')
backbone_model.eval()
backbone_model.cuda()

import mmcv
import urllib.request
from mmcv.runner import load_checkpoint

def load_config_from_url(url: str) -> str:
    with urllib.request.urlopen(url) as f:
        return f.read().decode()

HEAD_DATASET = "nyu" # in ("nyu", "kitti")
HEAD_TYPE = "dpt" # in ("linear", "linear4", "dpt")

backbone_name = "dinov2_vitl14"
DINOV2_BASE_URL = "https://dl.fbaipublicfiles.com/dinov2"

head_config_url = f"{DINOV2_BASE_URL}/{backbone_name}/{backbone_name}_{HEAD_DATASET}_{HEAD_TYPE}_config.py"
head_checkpoint_url = f"{DINOV2_BASE_URL}/{backbone_name}/{backbone_name}_{HEAD_DATASET}_{HEAD_TYPE}_head.pth"

cfg_str = load_config_from_url(head_config_url)
cfg = mmcv.Config.fromstring(cfg_str, file_format=".py")

model = create_depther(
    cfg,
    backbone_model=backbone_model,
    backbone_size=BACKBONE_SIZE,
    head_type=HEAD_TYPE,
)

load_checkpoint(model, head_checkpoint_url, map_location="cpu")

model.eval()
model.cuda()


from PIL import Image

image = Image.open("000003.png").convert("RGB")
import matplotlib
from torchvision import transforms


def make_depth_transform() -> transforms.Compose:
    return transforms.Compose([
        transforms.ToTensor(),
        lambda x: 255.0 * x[:3],  # Discard alpha component and scale by 255
        transforms.Normalize(  # ImageNet mean/std, expressed in 0-255 pixel units
            mean=(123.675, 116.28, 103.53),
            std=(58.395, 57.12, 57.375),
        ),
    ])


def render_depth(values, colormap_name="magma_r") -> Image.Image:
    min_value, max_value = values.min(), values.max()
    normalized_values = (values - min_value) / (max_value - min_value)

    colormap = matplotlib.colormaps[colormap_name]
    colors = colormap(normalized_values, bytes=True) # ((1)xhxwx4)
    colors = colors[:, :, :3] # Discard alpha component
    return Image.fromarray(colors)


transform = make_depth_transform()

scale_factor = 1  # increase to run inference at a higher input resolution
rescaled_image = image.resize((scale_factor * image.width, scale_factor * image.height))
transformed_image = transform(rescaled_image)
batch = transformed_image.unsqueeze(0).cuda() # Make a batch of one image

with torch.inference_mode():
    result = model.whole_inference(batch, img_meta=None, rescale=True)

depth_image = render_depth(result.squeeze().cpu())

# pil image save
depth_image.save("depth_image.jpg")
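As a quick sanity check on the backbone wiring, the snippet below (a minimal sketch, assuming a CUDA device and the dinov2_vitl14 backbone loaded above) prints the shape of the reshaped feature map that the depth head consumes:

with torch.inference_mode():
    feats = backbone_model.get_intermediate_layers(
        torch.randn(1, 3, 224, 224, device="cuda"), n=1, reshape=True
    )
# ViT-L/14 on a 224x224 input yields a (1, 1024, 16, 16) feature map.
print(feats[0].shape)

The second script below folds the same pipeline into one file, caches the head config and checkpoint locally with wget, and times batched inference: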

 

 

import math
import itertools
from functools import partial

import torch
import torch.nn.functional as F
from dinov2.eval.depth.models import build_depther
import os
import wget
import time

import mmcv
from mmcv.runner import load_checkpoint

from PIL import Image
import matplotlib
from torchvision import transforms

# input padding
class CenterPadding(torch.nn.Module):
    def __init__(self, multiple):
        super().__init__()
        self.multiple = multiple

    def _get_pad(self, size):
        new_size = math.ceil(size / self.multiple) * self.multiple
        pad_size = new_size - size
        pad_size_left = pad_size // 2
        pad_size_right = pad_size - pad_size_left
        return pad_size_left, pad_size_right

    @torch.inference_mode()
    def forward(self, x):
        pads = list(itertools.chain.from_iterable(self._get_pad(m) for m in x.shape[:1:-1]))
        output = F.pad(x, pads)
        return output

# model
def create_depther(cfg, backbone_model, backbone_size, head_type):
    train_cfg = cfg.get("train_cfg")
    test_cfg = cfg.get("test_cfg")
    depther = build_depther(cfg.model, train_cfg=train_cfg, test_cfg=test_cfg)

    print(depther)

    depther.backbone.forward = partial(
        backbone_model.get_intermediate_layers,
        n=cfg.model.backbone.out_indices,
        reshape=True,
        return_class_token=cfg.model.backbone.output_cls_token,
        norm=cfg.model.backbone.final_norm,
    )

    if hasattr(backbone_model, "patch_size"):
        depther.backbone.register_forward_pre_hook(lambda _, x: CenterPadding(backbone_model.patch_size)(x[0]))

    return depther

def make_depth_transform() -> transforms.Compose:
    return transforms.Compose([
        transforms.ToTensor(),
        lambda x: 255.0 * x[:3], # Discard alpha component and scale by 255
        transforms.Normalize(
            mean=(123.675, 116.28, 103.53),
            std=(58.395, 57.12, 57.375),
        ),
    ])

def render_depth(values, colormap_name="magma_r") -> Image.Image:
    min_value, max_value = values.min(), values.max()
    normalized_values = (values - min_value) / (max_value - min_value)

    colormap = matplotlib.colormaps[colormap_name]
    colors = colormap(normalized_values, bytes=True) # ((1)xhxwx4)
    colors = colors[:, :, :3] # Discard alpha component
    return Image.fromarray(colors)

if __name__ == "__main__":

    tic = time.time()

    # download the head config and checkpoint if they are not cached locally
    DINOV2_BASE_URL = "https://dl.fbaipublicfiles.com/dinov2"
    backbone_name = "dinov2_vitl14"
    head_config_url = f"{DINOV2_BASE_URL}/{backbone_name}/{backbone_name}_nyu_dpt_config.py"
    head_checkpoint_url = f"{DINOV2_BASE_URL}/{backbone_name}/{backbone_name}_nyu_dpt_head.pth"

    cfg_path = 'dinov2_vitl14_nyu_dpt_config.py'
    checkpoint_path = 'dinov2_vitl14_nyu_dpt_head.pth'

    if not os.path.exists(cfg_path):
        wget.download(head_config_url)
    if not os.path.exists(checkpoint_path):
        wget.download(head_checkpoint_url)

    # load the DINOv2 backbone
    backbone_model = torch.hub.load('facebookresearch/dinov2', 'dinov2_vitl14')
    backbone_model.eval()
    backbone_model.cuda()

    with open(cfg_path, "r") as file:
        cfg_str = file.read()
    cfg = mmcv.Config.fromstring(cfg_str, file_format=".py")

    model = create_depther(
        cfg,
        backbone_model=backbone_model,
        backbone_size="large",
        head_type="dpt",
    )

    # load checkpoint
    load_checkpoint(model, checkpoint_path, map_location="cpu")
    model.eval()
    model.cuda()

    toc = time.time()
    print(f"Model loading time: {toc - tic:.2f}s")

    image = Image.open("224.png").convert("RGB")
    transform = make_depth_transform()
    rescaled_image = image.resize((224, 224))
    transformed_image = transform(rescaled_image)

    # batch = transformed_image.unsqueeze(0).cuda()  # a batch of one image
    # Stack the same image 16 times to exercise batched-inference throughput:
    batchsize = 16
    batch = torch.cat([transformed_image.unsqueeze(0) for _ in range(batchsize)], dim=0).cuda()

    tic = time.time()
    with torch.inference_mode():
        result = model.whole_inference(batch, img_meta=None, rescale=True)

    toc = time.time()
    print(f"Model inference time: {toc - tic:.2f}s")
    # depth_image = render_depth(result.squeeze().cpu())

    # # pil image save
    # depth_image.save("depth_image_1.jpg")
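To also save the batched outputs, a minimal sketch to drop into the __main__ block (assuming whole_inference returns a (batchsize, 1, H, W) tensor, consistent with the single-image case above):

    # Hypothetical extension: colorize and save every depth map in the batch.
    for i in range(result.shape[0]):
        depth_image = render_depth(result[i].squeeze().cpu())
        depth_image.save(f"depth_image_{i}.jpg")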




